[m-rev.] for review: Add conversion predicates between strings and UTF-8/UTF-16 code units.
Peter Wang
novalazy at gmail.com
Wed Feb 25 14:56:31 AEDT 2015
Add the predicates:
string.to_utf8_code_unit_list/2
string.to_utf16_code_unit_list/2
string.from_utf8_code_unit_list/2
string.from_utf16_code_unit_list/2
library/string.m:
Implement the above predicates in Mercury code that works across
backends.
Add internal_encoding_is_utf8 helper predicate.
Forward the new predicates, for which the named encoding matches
that of the backend in use, to string.to_code_unit_list and
string.from_code_unit_list.
tests/hard_coded/Mmakefile
tests/hard_coded/string_code_unit.exp
tests/hard_coded/string_code_unit.m
Add test case.
NEWS:
Announce the additions.
diff --git a/NEWS b/NEWS
index cc9cb1e..c5a3a9f 100644
--- a/NEWS
+++ b/NEWS
@@ -126,6 +126,10 @@ Changes to the Mercury standard library:
- is_all_alnum/1
- is_empty/1
+ - to_utf8_code_unit_list/2
+ - to_utf16_code_unit_list/2
+ - from_utf8_code_unit_list/2
+ - from_utf16_code_unit_list/2
* The following predicates have been added to the map module:
diff --git a/library/string.m b/library/string.m
index 85f6549..5afa52f 100644
--- a/library/string.m
+++ b/library/string.m
@@ -160,16 +160,35 @@
%
:- pred semidet_from_rev_char_list(list(char)::in, string::uo) is semidet.
- % Convert a string into a list of code units.
+ % Convert a string into a list of code units of the string encoding used
+ % by the current process.
%
:- pred to_code_unit_list(string::in, list(int)::out) is det.
+ % Convert a string into a list of UTF-8 code units.
+ %
+:- pred to_utf8_code_unit_list(string::in, list(int)::out) is det.
+
+ % Convert a string into a list of UTF-16 code units.
+ %
+:- pred to_utf16_code_unit_list(string::in, list(int)::out) is det.
+
% Convert a list of code units to a string.
% Fails if the list does not contain a valid encoding of a string,
% in the encoding expected by the current process.
%
:- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
+ % Convert a list of UTF-8 code units to a string.
+ % Fails if the list does not contain a valid encoding of a string.
+ %
+:- pred from_utf8_code_unit_list(list(int)::in, string::uo) is semidet.
+
+ % Convert a list of UTF-8 code units to a string.
+ % Fails if the list does not contain a valid encoding of a string.
+ %
+:- pred from_utf16_code_unit_list(list(int)::in, string::uo) is semidet.
+
% duplicate_char(Char, Count, String):
%
% Construct a string consisting of `Count' occurrences of `Char'
@@ -1351,6 +1370,41 @@
%---------------------------------------------------------------------------%
%
+% String encoding.
+%
+
+ % Succeed if the internal string encoding is UTF-8, fail if it is UTF-16.
+ % No other encodings are supported.
+ %
+:- pred internal_encoding_is_utf8 is semidet.
+
+:- pragma foreign_proc("C",
+ internal_encoding_is_utf8,
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ SUCCESS_INDICATOR = MR_TRUE;
+").
+:- pragma foreign_proc("C#",
+ internal_encoding_is_utf8,
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ SUCCESS_INDICATOR = false;
+").
+:- pragma foreign_proc("Java",
+ internal_encoding_is_utf8,
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ SUCCESS_INDICATOR = false;
+").
+:- pragma foreign_proc("Erlang",
+ internal_encoding_is_utf8,
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ SUCCESS_INDICATOR = true
+").
+
+%---------------------------------------------------------------------------%
+%
% Conversions between strings and lists of characters.
%
@@ -1707,6 +1761,42 @@ string.to_code_unit_list_loop(String, Index, End, List) :-
%---------------------%
+string.to_utf8_code_unit_list(String, CodeList) :-
+ ( internal_encoding_is_utf8 ->
+ string.to_code_unit_list(String, CodeList)
+ ;
+ string.foldr(encode_utf8, String, [], CodeList)
+ ).
+
+:- pred encode_utf8(char::in, list(int)::in, list(int)::out) is det.
+
+encode_utf8(Char, CodeList0, CodeList) :-
+ ( char.to_utf8(Char, CharCodes) ->
+ CodeList = CharCodes ++ CodeList0
+ ;
+ unexpected($module, $pred, "char.to_utf8 failed")
+ ).
+
+%---------------------%
+
+string.to_utf16_code_unit_list(String, CodeList) :-
+ ( internal_encoding_is_utf8 ->
+ string.foldr(encode_utf16, String, [], CodeList)
+ ;
+ string.to_code_unit_list(String, CodeList)
+ ).
+
+:- pred encode_utf16(char::in, list(int)::in, list(int)::out) is det.
+
+encode_utf16(Char, CodeList0, CodeList) :-
+ ( char.to_utf16(Char, CharCodes) ->
+ CodeList = CharCodes ++ CodeList0
+ ;
+ unexpected($module, $pred, "char.to_utf16 failed")
+ ).
+
+%---------------------%
+
:- pragma foreign_proc("C",
string.from_code_unit_list(CodeList::in, Str::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -1832,6 +1922,96 @@ string.to_code_unit_list_loop(String, Index, End, List) :-
%---------------------%
+string.from_utf8_code_unit_list(CodeList, String) :-
+ ( internal_encoding_is_utf8 ->
+ string.from_code_unit_list(CodeList, String)
+ ;
+ decode_utf8(CodeList, [], RevChars),
+ string.from_rev_char_list(RevChars, String)
+ ).
+
+:- pred decode_utf8(list(int)::in, list(char)::in, list(char)::out) is semidet.
+
+decode_utf8([], RevChars, RevChars).
+decode_utf8([A | FollowA], RevChars0, RevChars) :-
+ ( A < 0 ->
+ fail
+ ; A =< 0x7f -> % 1-byte sequence
+ CharInt = A,
+ Rest = FollowA
+ ; A =< 0xc1 ->
+ fail
+ ; A =< 0xdf -> % 2-byte sequence
+ FollowA = [B | Rest],
+ utf8_is_trail_byte(B),
+ CharInt = (A /\ 0x1f) << 6
+ \/ (B /\ 0x3f),
+ CharInt >= 0x80
+ ; A =< 0xef -> % 3-byte sequence
+ FollowA = [B, C | Rest],
+ utf8_is_trail_byte(B),
+ utf8_is_trail_byte(C),
+ CharInt = (A /\ 0x0f) << 12
+ \/ (B /\ 0x3f) << 6
+ \/ (C /\ 0x3f),
+ CharInt >= 0x800
+ ; A =< 0xf4 -> % 4-byte sequence
+ FollowA = [B, C, D | Rest],
+ utf8_is_trail_byte(B),
+ utf8_is_trail_byte(C),
+ utf8_is_trail_byte(D),
+ CharInt = (A /\ 0x07) << 18
+ \/ (B /\ 0x3f) << 12
+ \/ (C /\ 0x3f) << 6
+ \/ (D /\ 0x3f),
+ CharInt >= 0x10000
+ ;
+ fail
+ ),
+ char.from_int(CharInt, Char),
+ decode_utf8(Rest, [Char | RevChars0], RevChars).
+
+:- pred utf8_is_trail_byte(int::in) is semidet.
+
+utf8_is_trail_byte(C) :-
+ (C /\ 0xc0) = 0x80.
+
+%---------------------%
+
+string.from_utf16_code_unit_list(CodeList, String) :-
+ ( internal_encoding_is_utf8 ->
+ decode_utf16(CodeList, [], RevChars),
+ string.from_rev_char_list(RevChars, String)
+ ;
+ string.from_code_unit_list(CodeList, String)
+ ).
+
+:- pred decode_utf16(list(int)::in, list(char)::in, list(char)::out)
+ is semidet.
+
+decode_utf16([], RevChars, RevChars).
+decode_utf16([A | FollowA], RevChars0, RevChars) :-
+ ( A < 0 ->
+ fail
+ ; A < 0xd800 ->
+ CharInt = A,
+ Rest = FollowA
+ ; A < 0xdc00 ->
+ FollowA = [B | Rest],
+ B >= 0xdc00,
+ B =< 0xdfff,
+ CharInt = (A << 10) + B - 0x35fdc00
+ ; A =< 0xffff ->
+ CharInt = A,
+ Rest = FollowA
+ ;
+ fail
+ ),
+ char.from_int(CharInt, Char),
+ decode_utf16(Rest, [Char | RevChars0], RevChars).
+
+%---------------------%
+
string.duplicate_char(C, N) = S :-
string.duplicate_char(C, N, S).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 62edbfa..35b41ff 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -287,6 +287,7 @@ ORDINARY_PROGS = \
string_append_ooi \
string_builder_test \
string_class \
+ string_code_unit \
string_codepoint \
string_first_char \
string_loop \
diff --git a/tests/hard_coded/string_code_unit.exp b/tests/hard_coded/string_code_unit.exp
new file mode 100644
index 0000000..ed981e3
--- /dev/null
+++ b/tests/hard_coded/string_code_unit.exp
@@ -0,0 +1,28 @@
+code points: 0x01, 0x7f
+UTF-8: 0x01, 0x7f
+UTF-16: 0x01, 0x7f
+
+code points: 0x80, 0x7ff
+UTF-8: 0xc2, 0x80, 0xdf, 0xbf
+UTF-16: 0x80, 0x7ff
+
+code points: 0x800, 0xffff
+UTF-8: 0xe0, 0xa0, 0x80, 0xef, 0xbf, 0xbf
+UTF-16: 0x800, 0xffff
+
+code points: 0x100000, 0x10ffff
+UTF-8: 0xf4, 0x80, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+UTF-16: 0xdbc0, 0xdc00, 0xdbff, 0xdfff
+
+code points: 0x01, 0xd7ff
+UTF-8: 0x01, 0xed, 0x9f, 0xbf
+UTF-16: 0x01, 0xd7ff
+
+code points: 0xe000, 0xffff
+UTF-8: 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf
+UTF-16: 0xe000, 0xffff
+
+code points: 0x10000, 0x10ffff
+UTF-8: 0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
+UTF-16: 0xd800, 0xdc00, 0xdbff, 0xdfff
+
diff --git a/tests/hard_coded/string_code_unit.m b/tests/hard_coded/string_code_unit.m
new file mode 100644
index 0000000..2fd8bbe
--- /dev/null
+++ b/tests/hard_coded/string_code_unit.m
@@ -0,0 +1,83 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+
+:- module string_code_unit.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ Strings = [
+ % "\u0000", % NUL
+ "\u0001\u007f", % ASCII
+ "\u0080\u07ff", % UTF-8 2-byte
+ "\u0800\uffff", % UTF-8 3-byte
+ "\U00100000\U0010ffff", % UTF-8 4-byte
+ "\u0001\ud7ff", % UTF-16 1 code unit
+ "\ue000\uffff", % UTF-16 1 code unit
+ "\U00010000\U0010ffff" % UTF-16 2 code units
+ % "\ud800\udbff", % leading surrogates
+ % "\udc00\udfff", % trailing surrogates
+ % "\U00110000" % out of range
+ ],
+ list.foldl(test, Strings, !IO).
+
+:- pred test(string::in, io::di, io::uo) is det.
+
+test(String, !IO) :-
+ io.write_string("code points:\t", !IO),
+ string.to_char_list(String, Chars),
+ write_hex_chars(Chars, !IO),
+ io.nl(!IO),
+
+ string.to_utf8_code_unit_list(String, UTF8),
+ io.write_string("UTF-8:\t\t", !IO),
+ write_hex_ints(UTF8, !IO),
+ io.nl(!IO),
+ ( string.from_utf8_code_unit_list(UTF8, String) ->
+ true
+ ;
+ io.write_string("from_utf8_code_unit_list failed\n", !IO)
+ ),
+
+ string.to_utf16_code_unit_list(String, UTF16),
+ io.write_string("UTF-16:\t\t", !IO),
+ write_hex_ints(UTF16, !IO),
+ io.nl(!IO),
+ ( string.from_utf16_code_unit_list(UTF16, String) ->
+ true
+ ;
+ io.write_string("from_utf16_code_unit_list failed\n", !IO)
+ ),
+ io.nl(!IO).
+
+:- pred write_hex_chars(list(char)::in, io::di, io::uo) is det.
+
+write_hex_chars(Chars, !IO) :-
+ write_hex_ints(map(to_int, Chars), !IO).
+
+:- pred write_hex_ints(list(int)::in, io::di, io::uo) is det.
+
+write_hex_ints(Ints, !IO) :-
+ io.write_list(Ints, ", ", write_hex_int, !IO).
+
+:- pred write_hex_int(int::in, io::di, io::uo) is det.
+
+write_hex_int(Int, !IO) :-
+ io.format("0x%02x", [i(Int)], !IO).
+
+%---------------------------------------------------------------------------%
--
2.1.2
More information about the reviews
mailing list