[m-rev.] for review: Fix some handling of ill-formed sequences in string module.

Peter Wang novalazy at gmail.com
Tue Jul 26 17:49:59 AEST 2022


I can add something to NEWS if required.
----

library/string.m:
    Document that string.duplicate_char may throw an exception.

    Fix string.all_match to fail if the string being tested contains
    any ill-formed code unit sequences.

    Fix the Mercury implementation of string.contains_char to continue
    searching for the character past any ill-formed code unit sequences.
    The foreign code implementations already did so.

    Fix unsafe_index_next_repl and unsafe_prev_index_repl in C grades.
    We indexed the C string with `ReplacedCodeUnit = Str[Index]' but
    since Mercury strings have the C type `char *', ReplacedCodeUnit
    could take on a negative value. When ReplacedCodeUnit == -1,
    the caller would assume there is a valid encoding of a code point
    beginning at Index, and therefore return `not_replaced' instead of
    `replaced_code_unit(255)'.

    Add casts to `unsigned char' in other places where we index C
    strings.

tests/hard_coded/string_all_match.m:
tests/hard_coded/string_all_match.exp:
tests/hard_coded/string_all_match.exp2:
    Add test for string.all_match.

tests/hard_coded/contains_char_2.m:
tests/hard_coded/contains_char_2.exp:
    Delete older test case for string.contains_char.

tests/hard_coded/string_contains_char.m:
tests/hard_coded/string_contains_char.exp:
tests/hard_coded/string_contains_char.exp2:
    Add better test for string.contains_char.

tests/hard_coded/Mmakefile:
    Enable the tests.

diff --git a/library/string.m b/library/string.m
index 0844c84c2..573ea2909 100644
--- a/library/string.m
+++ b/library/string.m
@@ -216,7 +216,9 @@
     %
     % Construct a string consisting of Count occurrences of Char code points
     % in sequence, returning the empty string if Count is less than or equal
-    % to zero.
+    % to zero. Throws an exception if Char is a null character or code point
+    % that cannot be encoded in a string (namely, surrogate code points cannot
+    % be encoded in UTF-8 strings).
     %
 :- func duplicate_char(char::in, int::in) = (string::uo) is det.
 :- pred duplicate_char(char::in, int::in, string::uo) is det.
@@ -2390,7 +2392,7 @@ unsafe_index(S, N) = C :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    Ch = Str[Index];
+    Ch = (unsigned char) Str[Index];
     if (!MR_is_ascii(Ch)) {
         int width;
         Ch = MR_utf8_get_mb(Str, Index, &width);
@@ -2462,7 +2464,7 @@ unsafe_index_next_repl(Str, Index, NextIndex, Ch, MaybeReplaced) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    Ch = Str[Index];
+    Ch = (unsigned char) Str[Index];
     ReplacedCodeUnit = -1;
     if (MR_is_ascii(Ch)) {
         NextIndex = Index + 1;
@@ -2472,7 +2474,7 @@ unsafe_index_next_repl(Str, Index, NextIndex, Ch, MaybeReplaced) :-
         Ch = MR_utf8_get_next_mb(Str, &NextIndex);
         if (Ch < 0) {
             Ch = 0xfffd;
-            ReplacedCodeUnit = Str[Index];
+            ReplacedCodeUnit = (unsigned char) Str[Index];
             NextIndex = Index + 1;
         }
         SUCCESS_INDICATOR = MR_TRUE;
@@ -2563,7 +2565,7 @@ unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, MaybeReplaced) :-
         SUCCESS_INDICATOR = MR_FALSE;
     } else {
         PrevIndex = Index - 1;
-        Ch = Str[PrevIndex];
+        Ch = (unsigned char) Str[PrevIndex];
         if (! MR_is_ascii(Ch)) {
             Ch = MR_utf8_prev_get(Str, &PrevIndex);
             // XXX MR_utf8_prev_get currently just scans backwards to find a
@@ -2571,7 +2573,7 @@ unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, MaybeReplaced) :-
             // unaccounted for.
             if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
                 Ch = 0xfffd;
-                ReplacedCodeUnit = Str[Index - 1];
+                ReplacedCodeUnit = (unsigned char) Str[Index - 1];
                 PrevIndex = Index - 1;
             }
         }
@@ -3291,7 +3293,8 @@ all_match(P, String) :-
     int::in) is semidet.
 
 all_match_loop(P, String, Cur) :-
-    ( if unsafe_index_next_repl(String, Cur, Next, Char, not_replaced) then
+    ( if unsafe_index_next_repl(String, Cur, Next, Char, MaybeReplaced) then
+        MaybeReplaced = not_replaced,
         P(Char),
         all_match_loop(P, String, Next)
     else
@@ -3362,8 +3365,11 @@ contains_char(String, Char) :-
 :- pred contains_char_loop(string::in, char::in, int::in) is semidet.
 
 contains_char_loop(Str, Char, I) :-
-    unsafe_index_next_repl(Str, I, J, IndexChar, not_replaced),
-    ( if IndexChar = Char then
+    unsafe_index_next_repl(Str, I, J, IndexChar, MaybeReplaced),
+    ( if
+        MaybeReplaced = not_replaced,
+        IndexChar = Char
+    then
         true
     else
         contains_char_loop(Str, Char, J)
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index caa47e9ac..484cd7e40 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -79,7 +79,6 @@ ORDINARY_PROGS = \
 	construct_test \
 	construct_test_exist \
 	contains_char \
-	contains_char_2 \
 	contravariance_bug \
 	contravariance_poly \
 	curry \
@@ -373,6 +372,7 @@ ORDINARY_PROGS = \
 	stream_test \
 	string_alignment \
 	string_alignment_bug \
+	string_all_match \
 	string_append_iii \
 	string_append_ioi \
 	string_append_ooi \
@@ -385,6 +385,7 @@ ORDINARY_PROGS = \
 	string_codepoint \
 	string_codepoint_offset_ilseq \
 	string_compare_substrings \
+	string_contains_char \
 	string_contains_match \
 	string_count_codepoints_ilseq \
 	string_first_char \
diff --git a/tests/hard_coded/contains_char_2.exp b/tests/hard_coded/contains_char_2.exp
deleted file mode 100644
index 4745bd879..000000000
--- a/tests/hard_coded/contains_char_2.exp
+++ /dev/null
@@ -1 +0,0 @@
-test succeeded
diff --git a/tests/hard_coded/contains_char_2.m b/tests/hard_coded/contains_char_2.m
deleted file mode 100644
index 07628c060..000000000
--- a/tests/hard_coded/contains_char_2.m
+++ /dev/null
@@ -1,39 +0,0 @@
-%---------------------------------------------------------------------------%
-% vim: ts=4 sw=4 et ft=mercury
-%---------------------------------------------------------------------------%
-
-:- module contains_char_2.
-:- interface.
-
-:- import_module io.
-
-:- pred main(io::di, io::uo) is det.
-
-%---------------------------------------------------------------------------%
-%---------------------------------------------------------------------------%
-
-:- implementation.
-
-:- import_module string.
-
-%---------------------------------------------------------------------------%
-
-main(!IO) :-
-    ( if
-        string.contains_char("cat", 'c'),
-        string.contains_char("cat", 'a'),
-        string.contains_char("cat", 't'),
-        not string.contains_char("cat", 'm'),
-        string.contains_char("aΓŸΞΎε••π€€.", 'ß'),
-        string.contains_char("aΓŸΞΎε••π€€.", 'ß'),
-        string.contains_char("aΓŸΞΎε••π€€.", 'ΞΎ'),
-        string.contains_char("aΓŸΞΎε••π€€.", 'ε••'),
-        string.contains_char("aΓŸΞΎε••π€€.", '.'),
-        not string.contains_char("aΓŸΞΎε••π€€.", '☿')
-    then
-        io.write_string("test succeeded\n", !IO)
-    else
-        io.write_string("test failed\n", !IO)
-    ).
-
-%---------------------------------------------------------------------------%
diff --git a/tests/hard_coded/string_all_match.exp b/tests/hard_coded/string_all_match.exp
new file mode 100644
index 000000000..2d86efdd6
--- /dev/null
+++ b/tests/hard_coded/string_all_match.exp
@@ -0,0 +1,6 @@
+all_match(is_alpha, "") ==> true.
+all_match(is_alpha, "abc") ==> true.
+all_match(is_alpha, "abc123") ==> false.
+all_match(is_chess_piece, "β™œβ™žβ™β™›β™šβ™β™žβ™œ") ==> true.
+all_match(is_chess_piece, "β™œβ™žβ™β™›Kβ™β™žβ™œ") ==> false.
+all_match(is_alpha, "abcοΏ½def") ==> false.
diff --git a/tests/hard_coded/string_all_match.exp2 b/tests/hard_coded/string_all_match.exp2
new file mode 100644
index 000000000..9847dac62
--- /dev/null
+++ b/tests/hard_coded/string_all_match.exp2
@@ -0,0 +1,6 @@
+all_match(is_alpha, "") ==> true.
+all_match(is_alpha, "abc") ==> true.
+all_match(is_alpha, "abc123") ==> false.
+all_match(is_chess_piece, "β™œβ™žβ™β™›β™šβ™β™žβ™œ") ==> true.
+all_match(is_chess_piece, "β™œβ™žβ™β™›Kβ™β™žβ™œ") ==> false.
+all_match(is_alpha, "abc?def") ==> false.
diff --git a/tests/hard_coded/string_all_match.m b/tests/hard_coded/string_all_match.m
new file mode 100644
index 000000000..fd0b4d4fc
--- /dev/null
+++ b/tests/hard_coded/string_all_match.m
@@ -0,0 +1,61 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+% A test of string.all_match/2.
+%
+% The .exp file is for C and C# grades.
+% The .exp2 file is for Java grades.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_all_match.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    test_all_match(is_alpha, "is_alpha", "", !IO),
+    test_all_match(is_alpha, "is_alpha", "abc", !IO),
+    test_all_match(is_alpha, "is_alpha", "abc123", !IO),
+    test_all_match(is_chess_piece, "is_chess_piece", "β™œβ™žβ™β™›β™šβ™β™žβ™œ", !IO),
+    test_all_match(is_chess_piece, "is_chess_piece", "β™œβ™žβ™β™›Kβ™β™žβ™œ", !IO),
+
+    Ilseq = string.between("πŸ˜€", 0, 1),
+    S = "abc" ++ Ilseq ++ "def",
+    test_all_match(is_alpha, "is_alpha", S, !IO).
+
+:- pred test_all_match(pred(char)::in(pred(in) is semidet),
+    string::in, string::in, io::di, io::uo) is det.
+
+test_all_match(Pred, PredName, String, !IO) :-
+    ( if string.all_match(Pred, String) then
+        Result = "true"
+    else
+        Result = "false"
+    ),
+    io.format("all_match(%s, %s) ==> %s.\n",
+        [s(PredName), s(string(String)), s(Result)], !IO).
+
+:- pred is_chess_piece(char::in) is semidet.
+
+is_chess_piece(Char) :-
+    char.to_int(Char, CodePoint),
+    CodePoint >= 0x2654, CodePoint =< 0x265F.
+
+%---------------------------------------------------------------------------%
+:- end_module string_all_match.
+%---------------------------------------------------------------------------%
diff --git a/tests/hard_coded/string_contains_char.exp b/tests/hard_coded/string_contains_char.exp
new file mode 100644
index 000000000..6eacc76a0
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.exp
@@ -0,0 +1,15 @@
+contains_char("", 'X') ==> false.
+contains_char("abc", 'X') ==> false.
+contains_char("abc", 'a') ==> true.
+contains_char("abc", 'b') ==> true.
+contains_char("abc", 'c') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ß') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ΞΎ') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ε••') ==> true.
+contains_char("aΓŸΞΎε••π€€.", '.') ==> true.
+contains_char("aΓŸΞΎε••π€€.", '☿') ==> false.
+contains_char("abcοΏ½123", 'c') ==> true.
+contains_char("abcοΏ½123", '1') ==> true.
+contains_char("abcοΏ½123", '2') ==> true.
+contains_char("abcοΏ½123", '3') ==> true.
+contains_char("abcοΏ½123", 'οΏ½') ==> false.
diff --git a/tests/hard_coded/string_contains_char.exp2 b/tests/hard_coded/string_contains_char.exp2
new file mode 100644
index 000000000..430748e2a
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.exp2
@@ -0,0 +1,15 @@
+contains_char("", 'X') ==> false.
+contains_char("abc", 'X') ==> false.
+contains_char("abc", 'a') ==> true.
+contains_char("abc", 'b') ==> true.
+contains_char("abc", 'c') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ß') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ΞΎ') ==> true.
+contains_char("aΓŸΞΎε••π€€.", 'ε••') ==> true.
+contains_char("aΓŸΞΎε••π€€.", '.') ==> true.
+contains_char("aΓŸΞΎε••π€€.", '☿') ==> false.
+contains_char("abc?123", 'c') ==> true.
+contains_char("abc?123", '1') ==> true.
+contains_char("abc?123", '2') ==> true.
+contains_char("abc?123", '3') ==> true.
+contains_char("abc?123", 'οΏ½') ==> false.
diff --git a/tests/hard_coded/string_contains_char.m b/tests/hard_coded/string_contains_char.m
new file mode 100644
index 000000000..4f9c1f1af
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.m
@@ -0,0 +1,63 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+% A test of string.contains_char/2.
+%
+% The .exp file is for C and C# grades.
+% The .exp2 file is for Java grades.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_contains_char.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    test_contains_char("", 'X', !IO),
+    test_contains_char("abc", 'X', !IO),
+    test_contains_char("abc", 'a', !IO),
+    test_contains_char("abc", 'b', !IO),
+    test_contains_char("abc", 'c', !IO),
+
+    test_contains_char("aΓŸΞΎε••π€€.", 'ß', !IO),
+    test_contains_char("aΓŸΞΎε••π€€.", 'ΞΎ', !IO),
+    test_contains_char("aΓŸΞΎε••π€€.", 'ε••', !IO),
+    test_contains_char("aΓŸΞΎε••π€€.", '.', !IO),
+    test_contains_char("aΓŸΞΎε••π€€.", '☿', !IO),
+
+    Ilseq = string.between("πŸ˜€", 0, 1),
+    S = "abc" ++ Ilseq ++ "123",
+    test_contains_char(S, 'c', !IO),
+    test_contains_char(S, '1', !IO),
+    test_contains_char(S, '2', !IO),
+    test_contains_char(S, '3', !IO),
+    test_contains_char(S, '\uFFFD', !IO).
+
+:- pred test_contains_char(string::in, char::in, io::di, io::uo) is det.
+
+test_contains_char(String, Char, !IO) :-
+    ( if string.contains_char(String, Char) then
+        Result = "true"
+    else
+        Result = "false"
+    ),
+    io.format("contains_char(%s, %s) ==> %s.\n",
+        [s(string(String)), s(string(Char)), s(Result)], !IO).
+
+%---------------------------------------------------------------------------%
+:- end_module string_contains_char.
+%---------------------------------------------------------------------------%
-- 
2.36.1



More information about the reviews mailing list