[m-rev.] for review: Make string.append(out, out, in) work with ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 23 15:02:04 AEDT 2019


library/string.m:
    Simplify string.append(out, out, in) and make it work sensibly in
    the presence of ill-formed code unit sequences: the input string is
    now split after each well-formed code point, and after each individual
    code unit of an ill-formed sequence.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_append_ooi_ilseq.exp:
tests/hard_coded/string_append_ooi_ilseq.exp2:
tests/hard_coded/string_append_ooi_ilseq.m:
    Add test case.
---
 library/string.m                              | 59 +++-----------
 tests/hard_coded/Mmakefile                    |  1 +
 tests/hard_coded/string_append_ooi_ilseq.exp  | 18 +++++
 tests/hard_coded/string_append_ooi_ilseq.exp2 | 12 +++
 tests/hard_coded/string_append_ooi_ilseq.m    | 78 +++++++++++++++++++
 5 files changed, 118 insertions(+), 50 deletions(-)
 create mode 100644 tests/hard_coded/string_append_ooi_ilseq.exp
 create mode 100644 tests/hard_coded/string_append_ooi_ilseq.exp2
 create mode 100644 tests/hard_coded/string_append_ooi_ilseq.m

diff --git a/library/string.m b/library/string.m
index ec893e9a7..ee624c32b 100644
--- a/library/string.m
+++ b/library/string.m
@@ -3610,62 +3610,21 @@ append(S1::out, S2::out, S3::in) :-
 :- pred append_ooi(string::out, string::out, string::in) is multi.
 
 append_ooi(S1, S2, S3) :-
-    S3Len = length(S3),
-    append_ooi_2(0, S3Len, S1, S2, S3).
+    Len3 = length(S3),
+    append_ooi_2(0, Len3, S1, S2, S3).
 
 :- pred append_ooi_2(int::in, int::in, string::out, string::out,
     string::in) is multi.
 
-append_ooi_2(NextS1Len, S3Len, S1, S2, S3) :-
-    ( if NextS1Len = S3Len then
-        append_ooi_3(NextS1Len, S3Len, S1, S2, S3)
-    else
-        (
-            append_ooi_3(NextS1Len, S3Len, S1, S2, S3)
-        ;
-            unsafe_index_next(S3, NextS1Len, AdvS1Len, _),
-            append_ooi_2(AdvS1Len, S3Len, S1, S2, S3)
-        )
+append_ooi_2(Start2, Len3, S1, S2, S3) :-
+    (
+        unsafe_between(S3, 0, Start2, S1),
+        unsafe_between(S3, Start2, Len3, S2)
+    ;
+        unsafe_index_next(S3, Start2, NextStart2, _Char),
+        append_ooi_2(NextStart2, Len3, S1, S2, S3)
     ).
 
-:- pred append_ooi_3(int::in, int::in, string::out,
-    string::out, string::in) is det.
-
-:- pragma foreign_proc("C",
-    append_ooi_3(S1Len::in, S3Len::in, S1::out, S2::out, S3::in),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, may_not_duplicate, no_sharing],
-"{
-    MR_allocate_aligned_string_msg(S1, S1Len, MR_ALLOC_ID);
-    MR_memcpy(S1, S3, S1Len);
-    S1[S1Len] = '\\0';
-    MR_allocate_aligned_string_msg(S2, S3Len - S1Len, MR_ALLOC_ID);
-    strcpy(S2, S3 + S1Len);
-}").
-:- pragma foreign_proc("C#",
-    append_ooi_3(S1Len::in, _S3Len::in, S1::out, S2::out, S3::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    S1 = S3.Substring(0, S1Len);
-    S2 = S3.Substring(S1Len);
-").
-:- pragma foreign_proc("Java",
-    append_ooi_3(S1Len::in, _S3Len::in, S1::out, S2::out, S3::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    S1 = S3.substring(0, S1Len);
-    S2 = S3.substring(S1Len);
-").
-:- pragma foreign_proc("Erlang",
-    append_ooi_3(S1Len::in, _S3Len::in, S1::out, S2::out, S3::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    << S1:S1Len/binary, S2/binary >> = S3
-").
-
-append_ooi_3(S1Len, _S3Len, S1, S2, S3) :-
-    split(S3, S1Len, S1, S2).
-
 S1 ++ S2 = append(S1, S2).
 
 %---------------------%
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 3b5e7f945..347a93c31 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -353,6 +353,7 @@ ORDINARY_PROGS = \
 	string_append_iii \
 	string_append_ioi \
 	string_append_ooi \
+	string_append_ooi_ilseq \
 	string_builder_test \
 	string_case \
 	string_char_list_ilseq \
diff --git a/tests/hard_coded/string_append_ooi_ilseq.exp b/tests/hard_coded/string_append_ooi_ilseq.exp
new file mode 100644
index 000000000..69c73faa9
--- /dev/null
+++ b/tests/hard_coded/string_append_ooi_ilseq.exp
@@ -0,0 +1,18 @@
+L: 
+R: 😀 0xf0 0x9f 0x98 z 
+
+L: 😀 
+R: 0xf0 0x9f 0x98 z 
+
+L: 😀 0xf0 
+R: 0x9f 0x98 z 
+
+L: 😀 0xf0 0x9f 
+R: 0x98 z 
+
+L: 😀 0xf0 0x9f 0x98 
+R: z 
+
+L: 😀 0xf0 0x9f 0x98 z 
+R: 
+
diff --git a/tests/hard_coded/string_append_ooi_ilseq.exp2 b/tests/hard_coded/string_append_ooi_ilseq.exp2
new file mode 100644
index 000000000..2013cb783
--- /dev/null
+++ b/tests/hard_coded/string_append_ooi_ilseq.exp2
@@ -0,0 +1,12 @@
+L: 
+R: 😀 0xd83d z 
+
+L: 😀 
+R: 0xd83d z 
+
+L: 😀 0xd83d 
+R: z 
+
+L: 😀 0xd83d z 
+R: 
+
diff --git a/tests/hard_coded/string_append_ooi_ilseq.m b/tests/hard_coded/string_append_ooi_ilseq.m
new file mode 100644
index 000000000..e3a13fcad
--- /dev/null
+++ b/tests/hard_coded/string_append_ooi_ilseq.m
@@ -0,0 +1,78 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_append_ooi_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is cc_multi.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module pair.
+:- import_module solutions.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, count_code_units(S0) - 1),
+    S = S0 ++ S1 ++ "z",
+    unsorted_aggregate(test_append_ooi(S), write_result, !IO).
+
+:- pred test_append_ooi(string::in, pair(string, string)::out) is multi.
+
+test_append_ooi(S, L - R) :-
+    string.append(L, R, S).
+
+:- pred write_result(pair(string, string)::in, io::di, io::uo) is det.
+
+write_result(L - R, !IO) :-
+    io.write_string("L: ", !IO),
+    write_string_debug(L, !IO),
+    io.write_string("\n", !IO),
+    io.write_string("R: ", !IO),
+    write_string_debug(R, !IO),
+    io.write_string("\n\n", !IO).
+
+:- pred write_string_debug(string::in, io::di, io::uo) is det.
+
+write_string_debug(S, !IO) :-
+    write_string_debug_loop(S, 0, !IO).
+
+:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.
+
+write_string_debug_loop(S, Index, !IO) :-
+    ( if string.index_next(S, Index, NextIndex, Char) then
+        ( if Char = '\ufffd' then
+            string.unsafe_index_code_unit(S, Index, CodeUnit),
+            write_hex(CodeUnit, !IO)
+        else if is_surrogate(Char) then
+            write_hex(char.to_int(Char), !IO)
+        else
+            io.write_char(Char, !IO)
+        ),
+        io.write_char(' ', !IO),
+        write_string_debug_loop(S, NextIndex, !IO)
+    else
+        true
+    ).
+
+:- pred write_hex(int::in, io::di, io::uo) is det.
+
+write_hex(I, !IO) :-
+    io.format("%#x", [i(I)], !IO).
-- 
2.23.0



More information about the reviews mailing list