[m-rev.] for review: Add conversion predicates between strings and UTF-8/UTF-16 code units.

Peter Wang novalazy at gmail.com
Wed Feb 25 14:56:31 AEDT 2015


Add the predicates:

	string.to_utf8_code_unit_list/2
	string.to_utf16_code_unit_list/2
	string.from_utf8_code_unit_list/2
	string.from_utf16_code_unit_list/2

library/string.m:
	Implement the above predicates in Mercury code that works across
	backends.

	Add internal_encoding_is_utf8 helper predicate.

	Forward the new predicates, for which the named encoding matches
	that of the backend in use, to string.to_code_unit_list and
	string.from_code_unit_list.

tests/hard_coded/Mmakefile
tests/hard_coded/string_code_unit.exp
tests/hard_coded/string_code_unit.m
	Add test case.

NEWS:
	Announce the additions.

diff --git a/NEWS b/NEWS
index cc9cb1e..c5a3a9f 100644
--- a/NEWS
+++ b/NEWS
@@ -126,6 +126,10 @@ Changes to the Mercury standard library:
 
    - is_all_alnum/1
    - is_empty/1
+   - to_utf8_code_unit_list/2
+   - to_utf16_code_unit_list/2
+   - from_utf8_code_unit_list/2
+   - from_utf16_code_unit_list/2
 
 * The following predicates have been added to the map module:
 
diff --git a/library/string.m b/library/string.m
index 85f6549..5afa52f 100644
--- a/library/string.m
+++ b/library/string.m
@@ -160,16 +160,35 @@
     %
 :- pred semidet_from_rev_char_list(list(char)::in, string::uo) is semidet.
 
-    % Convert a string into a list of code units.
+    % Convert a string into a list of code units of the string encoding used
+    % by the current process.
     %
 :- pred to_code_unit_list(string::in, list(int)::out) is det.
 
+    % Convert a string into a list of UTF-8 code units.
+    %
+:- pred to_utf8_code_unit_list(string::in, list(int)::out) is det.
+
+    % Convert a string into a list of UTF-16 code units.
+    %
+:- pred to_utf16_code_unit_list(string::in, list(int)::out) is det.
+
     % Convert a list of code units to a string.
     % Fails if the list does not contain a valid encoding of a string,
     % in the encoding expected by the current process.
     %
 :- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
 
+    % Convert a list of UTF-8 code units to a string.
+    % Fails if the list does not contain a valid encoding of a string.
+    %
+:- pred from_utf8_code_unit_list(list(int)::in, string::uo) is semidet.
+
+    % Convert a list of UTF-8 code units to a string.
+    % Fails if the list does not contain a valid encoding of a string.
+    %
+:- pred from_utf16_code_unit_list(list(int)::in, string::uo) is semidet.
+
     % duplicate_char(Char, Count, String):
     %
     % Construct a string consisting of `Count' occurrences of `Char'
@@ -1351,6 +1370,41 @@
 
 %---------------------------------------------------------------------------%
 %
+% String encoding.
+%
+
+    % Succeed if the internal string encoding is UTF-8, fail if it is UTF-16.
+    % No other encodings are supported.
+    %
+:- pred internal_encoding_is_utf8 is semidet.
+
+:- pragma foreign_proc("C",
+    internal_encoding_is_utf8,
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    SUCCESS_INDICATOR = MR_TRUE;
+").
+:- pragma foreign_proc("C#",
+    internal_encoding_is_utf8,
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    SUCCESS_INDICATOR = false;
+").
+:- pragma foreign_proc("Java",
+    internal_encoding_is_utf8,
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    SUCCESS_INDICATOR = false;
+").
+:- pragma foreign_proc("Erlang",
+    internal_encoding_is_utf8,
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    SUCCESS_INDICATOR = true
+").
+
+%---------------------------------------------------------------------------%
+%
 % Conversions between strings and lists of characters.
 %
 
@@ -1707,6 +1761,42 @@ string.to_code_unit_list_loop(String, Index, End, List) :-
 
 %---------------------%
 
+string.to_utf8_code_unit_list(String, CodeList) :-
+    ( internal_encoding_is_utf8 ->
+        string.to_code_unit_list(String, CodeList)
+    ;
+        string.foldr(encode_utf8, String, [], CodeList)
+    ).
+
+:- pred encode_utf8(char::in, list(int)::in, list(int)::out) is det.
+
+encode_utf8(Char, CodeList0, CodeList) :-
+    ( char.to_utf8(Char, CharCodes) ->
+        CodeList = CharCodes ++ CodeList0
+    ;
+        unexpected($module, $pred, "char.to_utf8 failed")
+    ).
+
+%---------------------%
+
+string.to_utf16_code_unit_list(String, CodeList) :-
+    ( internal_encoding_is_utf8 ->
+        string.foldr(encode_utf16, String, [], CodeList)
+    ;
+        string.to_code_unit_list(String, CodeList)
+    ).
+
+:- pred encode_utf16(char::in, list(int)::in, list(int)::out) is det.
+
+encode_utf16(Char, CodeList0, CodeList) :-
+    ( char.to_utf16(Char, CharCodes) ->
+        CodeList = CharCodes ++ CodeList0
+    ;
+        unexpected($module, $pred, "char.to_utf16 failed")
+    ).
+
+%---------------------%
+
 :- pragma foreign_proc("C",
     string.from_code_unit_list(CodeList::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -1832,6 +1922,96 @@ string.to_code_unit_list_loop(String, Index, End, List) :-
 
 %---------------------%
 
+string.from_utf8_code_unit_list(CodeList, String) :-
+    ( internal_encoding_is_utf8 ->
+        string.from_code_unit_list(CodeList, String)
+    ;
+        decode_utf8(CodeList, [], RevChars),
+        string.from_rev_char_list(RevChars, String)
+    ).
+
+:- pred decode_utf8(list(int)::in, list(char)::in, list(char)::out) is semidet.
+
+decode_utf8([], RevChars, RevChars).
+decode_utf8([A | FollowA], RevChars0, RevChars) :-
+    ( A < 0 ->
+        fail
+    ; A =< 0x7f ->  % 1-byte sequence
+        CharInt = A,
+        Rest = FollowA
+    ; A =< 0xc1 ->
+        fail
+    ; A =< 0xdf ->  % 2-byte sequence
+        FollowA = [B | Rest],
+        utf8_is_trail_byte(B),
+        CharInt = (A /\ 0x1f) << 6
+               \/ (B /\ 0x3f),
+        CharInt >= 0x80
+    ; A =< 0xef ->  % 3-byte sequence
+        FollowA = [B, C | Rest],
+        utf8_is_trail_byte(B),
+        utf8_is_trail_byte(C),
+        CharInt = (A /\ 0x0f) << 12
+               \/ (B /\ 0x3f) << 6
+               \/ (C /\ 0x3f),
+        CharInt >= 0x800
+    ; A =< 0xf4 ->  % 4-byte sequence
+        FollowA = [B, C, D | Rest],
+        utf8_is_trail_byte(B),
+        utf8_is_trail_byte(C),
+        utf8_is_trail_byte(D),
+        CharInt = (A /\ 0x07) << 18
+               \/ (B /\ 0x3f) << 12
+               \/ (C /\ 0x3f) << 6
+               \/ (D /\ 0x3f),
+        CharInt >= 0x10000
+    ;
+        fail
+    ),
+    char.from_int(CharInt, Char),
+    decode_utf8(Rest, [Char | RevChars0], RevChars).
+
+:- pred utf8_is_trail_byte(int::in) is semidet.
+
+utf8_is_trail_byte(C) :-
+    (C /\ 0xc0) = 0x80.
+
+%---------------------%
+
+string.from_utf16_code_unit_list(CodeList, String) :-
+    ( internal_encoding_is_utf8 ->
+        decode_utf16(CodeList, [], RevChars),
+        string.from_rev_char_list(RevChars, String)
+    ;
+        string.from_code_unit_list(CodeList, String)
+    ).
+
+:- pred decode_utf16(list(int)::in, list(char)::in, list(char)::out)
+    is semidet.
+
+decode_utf16([], RevChars, RevChars).
+decode_utf16([A | FollowA], RevChars0, RevChars) :-
+    ( A < 0 ->
+        fail
+    ; A < 0xd800 ->
+        CharInt = A,
+        Rest = FollowA
+    ; A < 0xdc00 ->
+        FollowA = [B | Rest],
+        B >= 0xdc00,
+        B =< 0xdfff,
+        CharInt = (A << 10) + B - 0x35fdc00
+    ; A =< 0xffff ->
+        CharInt = A,
+        Rest = FollowA
+    ;
+        fail
+    ),
+    char.from_int(CharInt, Char),
+    decode_utf16(Rest, [Char | RevChars0], RevChars).
+
+%---------------------%
+
 string.duplicate_char(C, N) = S :-
     string.duplicate_char(C, N, S).
 
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 62edbfa..35b41ff 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -287,6 +287,7 @@ ORDINARY_PROGS =	\
 	string_append_ooi \
 	string_builder_test \
 	string_class \
+	string_code_unit \
 	string_codepoint \
 	string_first_char \
 	string_loop \
diff --git a/tests/hard_coded/string_code_unit.exp b/tests/hard_coded/string_code_unit.exp
new file mode 100644
index 0000000..ed981e3
--- /dev/null
+++ b/tests/hard_coded/string_code_unit.exp
@@ -0,0 +1,28 @@
+code points:	0x01, 0x7f
+UTF-8:		0x01, 0x7f
+UTF-16:		0x01, 0x7f
+
+code points:	0x80, 0x7ff
+UTF-8:		0xc2, 0x80, 0xdf, 0xbf
+UTF-16:		0x80, 0x7ff
+
+code points:	0x800, 0xffff
+UTF-8:		0xe0, 0xa0, 0x80, 0xef, 0xbf, 0xbf
+UTF-16:		0x800, 0xffff
+
+code points:	0x100000, 0x10ffff
+UTF-8:		0xf4, 0x80, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+UTF-16:		0xdbc0, 0xdc00, 0xdbff, 0xdfff
+
+code points:	0x01, 0xd7ff
+UTF-8:		0x01, 0xed, 0x9f, 0xbf
+UTF-16:		0x01, 0xd7ff
+
+code points:	0xe000, 0xffff
+UTF-8:		0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf
+UTF-16:		0xe000, 0xffff
+
+code points:	0x10000, 0x10ffff
+UTF-8:		0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+UTF-16:		0xd800, 0xdc00, 0xdbff, 0xdfff
+
diff --git a/tests/hard_coded/string_code_unit.m b/tests/hard_coded/string_code_unit.m
new file mode 100644
index 0000000..2fd8bbe
--- /dev/null
+++ b/tests/hard_coded/string_code_unit.m
@@ -0,0 +1,83 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+
+:- module string_code_unit.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    Strings = [
+        % "\u0000",             % NUL
+        "\u0001\u007f",         % ASCII
+        "\u0080\u07ff",         % UTF-8 2-byte
+        "\u0800\uffff",         % UTF-8 3-byte
+        "\U00100000\U0010ffff", % UTF-8 4-byte
+        "\u0001\ud7ff",         % UTF-16 1 code unit
+        "\ue000\uffff",         % UTF-16 1 code unit
+        "\U00010000\U0010ffff"  % UTF-16 2 code units
+        % "\ud800\udbff",       % leading surrogates
+        % "\udc00\udfff",       % trailing surrogates
+        % "\U00110000"          % out of range
+    ],
+    list.foldl(test, Strings, !IO).
+
+:- pred test(string::in, io::di, io::uo) is det.
+
+test(String, !IO) :-
+    io.write_string("code points:\t", !IO),
+    string.to_char_list(String, Chars),
+    write_hex_chars(Chars, !IO),
+    io.nl(!IO),
+
+    string.to_utf8_code_unit_list(String, UTF8),
+    io.write_string("UTF-8:\t\t", !IO),
+    write_hex_ints(UTF8, !IO),
+    io.nl(!IO),
+    ( string.from_utf8_code_unit_list(UTF8, String) ->
+        true
+    ;
+        io.write_string("from_utf8_code_unit_list failed\n", !IO)
+    ),
+
+    string.to_utf16_code_unit_list(String, UTF16),
+    io.write_string("UTF-16:\t\t", !IO),
+    write_hex_ints(UTF16, !IO),
+    io.nl(!IO),
+    ( string.from_utf16_code_unit_list(UTF16, String) ->
+        true
+    ;
+        io.write_string("from_utf16_code_unit_list failed\n", !IO)
+    ),
+    io.nl(!IO).
+
+:- pred write_hex_chars(list(char)::in, io::di, io::uo) is det.
+
+write_hex_chars(Chars, !IO) :-
+    write_hex_ints(map(to_int, Chars), !IO).
+
+:- pred write_hex_ints(list(int)::in, io::di, io::uo) is det.
+
+write_hex_ints(Ints, !IO) :-
+    io.write_list(Ints, ", ", write_hex_int, !IO).
+
+:- pred write_hex_int(int::in, io::di, io::uo) is det.
+
+write_hex_int(Int, !IO) :-
+    io.format("0x%02x", [i(Int)], !IO).
+
+%---------------------------------------------------------------------------%
-- 
2.1.2




More information about the reviews mailing list