[m-rev.] for review: Define behaviour of string.to_char_list (and rev) on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:20 AEDT 2019


library/string.m:
    Define string.to_char_list and string.to_rev_char_list to either
    replace code units in ill-formed sequences with U+FFFD or return
    unpaired surrogate code points.

    Use Mercury version of do_to_char_list instead of updating
    the foreign language implementations.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_char_list_ilseq.exp:
tests/hard_coded/string_char_list_ilseq.exp2:
tests/hard_coded/string_char_list_ilseq.m:
    Add test case.
---
 library/string.m                             | 123 ++++---------------
 tests/hard_coded/Mmakefile                   |   1 +
 tests/hard_coded/string_char_list_ilseq.exp  |   5 +
 tests/hard_coded/string_char_list_ilseq.exp2 |   5 +
 tests/hard_coded/string_char_list_ilseq.m    |  50 ++++++++
 5 files changed, 88 insertions(+), 96 deletions(-)
 create mode 100644 tests/hard_coded/string_char_list_ilseq.exp
 create mode 100644 tests/hard_coded/string_char_list_ilseq.exp2
 create mode 100644 tests/hard_coded/string_char_list_ilseq.m

diff --git a/library/string.m b/library/string.m
index 1b0d32e87..0501a3198 100644
--- a/library/string.m
+++ b/library/string.m
@@ -124,9 +124,15 @@
 %
 
     % Convert the string to a list of characters (code points).
+    %
+    % In the forward mode:
+    % If strings use UTF-8 encoding then each code unit in an ill-formed
+    % sequence is replaced by U+FFFD REPLACEMENT CHARACTER in the list.
+    % If strings use UTF-16 encoding then each unpaired surrogate code point
+    % is returned as a separate code point in the list.
+    %
     % The reverse mode of the predicate throws an exception if
     % the list of characters contains a null character.
-    %
     % NOTE: In the future we may also throw an exception if the list contains
     % a surrogate code point.
     %
@@ -135,10 +141,17 @@
 :- mode to_char_list(in, out) is det.
 :- mode to_char_list(uo, in) is det.
 
-    % Convert the string to a list of characters (code points) in reverse order.
+    % Convert the string to a list of characters (code points) in reverse
+    % order.
+    %
+    % In the forward mode:
+    % If strings use UTF-8 encoding then each code unit in an ill-formed
+    % sequence is replaced by U+FFFD REPLACEMENT CHARACTER in the list.
+    % If strings use UTF-16 encoding then each unpaired surrogate code point
+    % is returned as a separate code point in the list.
+    %
     % The reverse mode of the predicate throws an exception if
     % the list of characters contains a null character.
-    %
     % NOTE: In the future we may also throw an exception if the list contains
     % a surrogate code point.
     %
@@ -1514,26 +1527,6 @@
 % Conversions between strings and lists of characters.
 %
 
-% XXX ILSEQ Behaviour on ill-formed sequences differs by target language.
-%   - C: stops at first ill-formed sequence from the right
-%   - C#: unpaired surrogates are replaced by U+FFFD
-%   - Java: unpaired surrogates are returned as-is
-%
-% Perhaps we need versions of to_char_list/to_rev_char_list with choice of
-% action on ill-formed sequences:
-%   - throw an exception
-%   - ignore
-%   - replace with U+FFFD
-%
-% Replacing with U+FFFD is only necessary for UTF-8. Unpaired surrogates in
-% UTF-16 can represent themselves since they are just code points.
-%
-% For users that want round-trip conversion of potentially invalid UTF-8,
-% one possibility would be to map bytes in ill-formed sequences to chars
-% in the low surrogate range U+DC80..U+DCFF, see
-% https://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html
-% A simpler possibility is to introduce a char_or_code_unit type.
-
 to_char_list(S) = Cs :-
     to_char_list(S, Cs).
 
@@ -1546,84 +1539,22 @@ to_char_list(Str::uo, CharList::in) :-
 
 :- pred do_to_char_list(string::in, list(char)::out) is det.
 
-:- pragma foreign_proc("C",
-    do_to_char_list(Str::in, CharList::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"{
-    MR_Integer pos = strlen(Str);
-    int c;
+do_to_char_list(Str, CharList) :-
+    do_to_char_list_loop(Str, length(Str), [], CharList).
 
-    CharList = MR_list_empty_msg(MR_ALLOC_ID);
-    for (;;) {
-        c = MR_utf8_prev_get(Str, &pos);
-        if (c <= 0) {
-            break;
-        }
-        CharList = MR_char_list_cons_msg((MR_UnsignedChar) c, CharList,
-            MR_ALLOC_ID);
-    }
-}").
-:- pragma foreign_proc("C#",
-    do_to_char_list(Str::in, CharList::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"
-    list.List_1 lst = list.empty_list();
-    for (int i = Str.Length - 1; i >= 0; i--) {
-        int c;
-        char c2 = Str[i];
-        if (System.Char.IsLowSurrogate(c2)) {
-            try {
-                char c1 = Str[i - 1];
-                c = System.Char.ConvertToUtf32(c1, c2);
-                i--;
-            } catch (System.ArgumentOutOfRangeException) {
-                c = 0xfffd;
-            } catch (System.IndexOutOfRangeException) {
-                c = 0xfffd;
-            }
-        } else if (System.Char.IsHighSurrogate(c2)) {
-            c = 0xfffd;
-        } else {
-            c = c2;
-        }
-        lst = list.cons(c, lst);
-    }
-    CharList = lst;
-").
-:- pragma foreign_proc("Java",
-    do_to_char_list(Str::in, CharList::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"
-    list.List_1<Integer> lst = list.empty_list();
-    for (int i = Str.length(); i > 0; ) {
-        int c = Str.codePointBefore(i);
-        lst = list.cons(c, lst);
-        i -= java.lang.Character.charCount(c);
-    }
-    CharList = lst;
-").
-:- pragma foreign_proc("Erlang",
-    do_to_char_list(Str::in, CharList::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness],
-"
-    CharList = unicode:characters_to_list(Str)
-").
+:- pred do_to_char_list_loop(string::in, int::in,
+    list(char)::in, list(char)::out) is det.
 
-    % It would be worth checking if this Mercury implementation (or another
-    % one) is as fast the foreign_proc implementations, the next time any
-    % changes are required.
-    %
-do_to_char_list(Str, CharList) :-
-    foldr(list.cons, Str, [], CharList).
+do_to_char_list_loop(Str, Index0, !CharList) :-
+    ( if string.unsafe_prev_index(Str, Index0, Index1, C) then
+        !:CharList = [C | !.CharList],
+        do_to_char_list_loop(Str, Index1, !CharList)
+    else
+        true
+    ).
 
 %---------------------%
 
-% XXX ILSEQ unsafe_index_next causes truncation at first ill-formed sequence.
-
 to_rev_char_list(S) = Cs :-
     to_rev_char_list(S, Cs).
 
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 84e38bd34..f11b93eb0 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -354,6 +354,7 @@ ORDINARY_PROGS = \
 	string_append_ooi \
 	string_builder_test \
 	string_case \
+	string_char_list_ilseq \
 	string_class \
 	string_code_unit \
 	string_codepoint \
diff --git a/tests/hard_coded/string_char_list_ilseq.exp b/tests/hard_coded/string_char_list_ilseq.exp
new file mode 100644
index 000000000..1999b22a4
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.exp
@@ -0,0 +1,5 @@
+string.to_char_list
+[a, b, c, 😀, 0xfffd, 0xfffd, 0xfffd, x, y, z]
+
+string.to_rev_char_list
+[z, y, x, 0xfffd, 0xfffd, 0xfffd, 😀, c, b, a]
diff --git a/tests/hard_coded/string_char_list_ilseq.exp2 b/tests/hard_coded/string_char_list_ilseq.exp2
new file mode 100644
index 000000000..e45d8e4dd
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.exp2
@@ -0,0 +1,5 @@
+string.to_char_list
+[a, b, c, 😀, 0xd83d, x, y, z]
+
+string.to_rev_char_list
+[z, y, x, 0xd83d, 😀, c, b, a]
diff --git a/tests/hard_coded/string_char_list_ilseq.m b/tests/hard_coded/string_char_list_ilseq.m
new file mode 100644
index 000000000..615eea350
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.m
@@ -0,0 +1,50 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_char_list_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, count_code_units(S0) - 1),
+    S = "abc" ++ S0 ++ S1 ++ "xyz",
+
+    string.to_char_list(S, CharList),
+    io.write_string("string.to_char_list\n[", !IO),
+    io.write_list(CharList, ", ", write_char_or_hex, !IO),
+    io.write_string("]\n\n", !IO),
+
+    string.to_rev_char_list(S, RevCharList),
+    io.write_string("string.to_rev_char_list\n[", !IO),
+    io.write_list(RevCharList, ", ", write_char_or_hex, !IO),
+    io.write_string("]\n", !IO).
+
+:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
+
+write_char_or_hex(Char, !IO) :-
+    ( if Char = '\ufffd' ; char.is_surrogate(Char) then
+        io.format("%#x", [i(char.to_int(Char))], !IO)
+    else
+        io.write_char(Char, !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list