[m-rev.] for review: Define behaviour of string.foldl etc on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:22 AEDT 2019


library/string.m:
    As above.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_fold_ilseq.exp:
tests/hard_coded/string_fold_ilseq.exp2:
tests/hard_coded/string_fold_ilseq.m:
    Add test case.
---
 library/string.m                        | 27 +++++---------
 tests/hard_coded/Mmakefile              |  1 +
 tests/hard_coded/string_fold_ilseq.exp  |  5 +++
 tests/hard_coded/string_fold_ilseq.exp2 |  5 +++
 tests/hard_coded/string_fold_ilseq.m    | 49 +++++++++++++++++++++++++
 5 files changed, 69 insertions(+), 18 deletions(-)
 create mode 100644 tests/hard_coded/string_fold_ilseq.exp
 create mode 100644 tests/hard_coded/string_fold_ilseq.exp2
 create mode 100644 tests/hard_coded/string_fold_ilseq.m

diff --git a/library/string.m b/library/string.m
index 884363c82..10bbf6f5b 100644
--- a/library/string.m
+++ b/library/string.m
@@ -1006,8 +1006,15 @@
     % foldl(Closure, String, !Acc):
     %
     % `Closure' is an accumulator predicate which is to be called for each
-    % character (code point) of the string `String' in turn. The initial
-    % value of the accumulator is `!.Acc' and the final value is `!:Acc'.
+    % character (code point) of the string `String' in turn.
+    % If `String' contains ill-formed sequences, `Closure' is called for each
+    % code unit in an ill-formed sequence. If strings use UTF-8 encoding,
+    % U+FFFD is passed to `Closure' in place of each such code unit.
+    % If strings use UTF-16 encoding, each code unit in an ill-formed sequence
+    % is an unpaired surrogate code point, which will be passed to `Closure'.
+    %
+    % The initial value of the accumulator is `!.Acc' and the final value is
+    % `!:Acc'.
     % (foldl(Closure, String, !Acc)  is equivalent to
     %   to_char_list(String, Chars),
     %   list.foldl(Closure, Chars, !Acc)
@@ -5202,20 +5209,6 @@ break_up_string_reverse(Str, N, Prev) = Strs :-
 % Folds over the characters in strings.
 %
 
-% XXX ILSEQ The behaviour of foldl depends on unsafe_index_next.
-% For UTF-16, we can call Closure for unpaired surrogate code points like any
-% other code point. For UTF-8, bytes in ill-formed sequences cannot be passed
-% as `char's since they are not code points. Perhaps foldl should throw an
-% exception, or just should pass replacement char.
-%
-% We may want to introduce fold predicates with an accumulator of type:
-%       pred(char_or_code_unit, A, A)
-% or
-%       pred(char, maybe(int), A, A)
-% For the latter, the second argument would be set to `yes(CodeUnit)' for each
-% code unit in an ill-formed sequence, and the first argument could be set to
-% U+FFFD (replacement char).
-
 foldl(F, S, A) = B :-
     P = ( pred(X::in, Y::in, Z::out) is det :- Z = F(X, Y) ),
     foldl(P, S, A, B).
@@ -5308,8 +5301,6 @@ foldl2_substring(Closure, String, Start, Count, !Acc1, !Acc2) :-
 
 %---------------------%
 
-% XXX ILSEQ Behaviour depends on unsafe_prev_index.
-
 foldr(F, String, Acc0) = Acc :-
     Closure = ( pred(X::in, Y::in, Z::out) is det :- Z = F(X, Y)),
     foldr(Closure, String, Acc0, Acc).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 7906dcbf3..626f60c6a 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -360,6 +360,7 @@ ORDINARY_PROGS = \
 	string_codepoint \
 	string_count_codepoints_ilseq \
 	string_first_char \
+	string_fold_ilseq \
 	string_index_ilseq \
 	string_index_next_ilseq \
 	string_loop \
diff --git a/tests/hard_coded/string_fold_ilseq.exp b/tests/hard_coded/string_fold_ilseq.exp
new file mode 100644
index 000000000..bfc1435ba
--- /dev/null
+++ b/tests/hard_coded/string_fold_ilseq.exp
@@ -0,0 +1,5 @@
+string.foldl:
+a b c 😀 0xfffd 0xfffd 0xfffd x y z 
+
+string.foldr:
+z y x 0xfffd 0xfffd 0xfffd 😀 c b a 
diff --git a/tests/hard_coded/string_fold_ilseq.exp2 b/tests/hard_coded/string_fold_ilseq.exp2
new file mode 100644
index 000000000..d28c67096
--- /dev/null
+++ b/tests/hard_coded/string_fold_ilseq.exp2
@@ -0,0 +1,5 @@
+string.foldl:
+a b c 😀 0xd83d x y z 
+
+string.foldr:
+z y x 0xd83d 😀 c b a 
diff --git a/tests/hard_coded/string_fold_ilseq.m b/tests/hard_coded/string_fold_ilseq.m
new file mode 100644
index 000000000..ba8a64bcb
--- /dev/null
+++ b/tests/hard_coded/string_fold_ilseq.m
@@ -0,0 +1,49 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_fold_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, count_code_units(S0) - 1),
+    S = "abc" ++ S0 ++ S1 ++ "xyz",
+
+    io.write_string("string.foldl:\n", !IO),
+    string.foldl(write_char_or_hex, S, !IO),
+    io.write_string("\n\n", !IO),
+
+    io.write_string("string.foldr:\n", !IO),
+    string.foldr(write_char_or_hex, S, !IO),
+    io.write_string("\n", !IO).
+
+:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
+
+write_char_or_hex(Char, !IO) :-
+    ( if Char = '\ufffd' ; char.is_surrogate(Char) then
+        io.format("%#x", [i(char.to_int(Char))], !IO)
+    else
+        io.write_char(Char, !IO)
+    ),
+    io.write_char(' ', !IO).
-- 
2.23.0



More information about the reviews mailing list