[m-rev.] for review: Fix some handling of ill-formed sequences in string module.
Peter Wang
novalazy at gmail.com
Tue Jul 26 17:49:59 AEST 2022
I can add something to NEWS if required.
----
library/string.m:
Document that string.duplicate_char may throw an exception.
Fix string.all_match to fail if the string being tested contains
any ill-formed code unit sequences.
Fix the Mercury implementation of string.contains_char to continue
searching for the character past any ill-formed code unit sequences.
The foreign code implementations already did so.
Fix unsafe_index_next_repl and unsafe_prev_index_repl in C grades.
We indexed the C string with `ReplacedCodeUnit = Str[Index]' but
since Mercury strings have the C type `char *', ReplacedCodeUnit
could take on a negative value. When ReplacedCodeUnit == -1,
the caller would assume there is a valid encoding of a code point
beginning at Index, and therefore return `not_replaced' instead of
`replaced_code_unit(255)'.
Add casts to `unsigned char' in other places where we index C
strings.
tests/hard_coded/string_all_match.m:
tests/hard_coded/string_all_match.exp:
tests/hard_coded/string_all_match.exp2:
Add test for string.all_match.
tests/hard_coded/contains_char_2.m:
tests/hard_coded/contains_char_2.exp:
Delete older test case for string.contains_char.
tests/hard_coded/string_contains_char.m:
tests/hard_coded/string_contains_char.exp:
tests/hard_coded/string_contains_char.exp2:
Add better test for string.contains_char.
tests/hard_coded/Mmakefile:
Enable the tests.
diff --git a/library/string.m b/library/string.m
index 0844c84c2..573ea2909 100644
--- a/library/string.m
+++ b/library/string.m
@@ -216,7 +216,9 @@
%
% Construct a string consisting of Count occurrences of Char code points
% in sequence, returning the empty string if Count is less than or equal
- % to zero.
+ % to zero. Throws an exception if Char is a null character or code point
+ % that cannot be encoded in a string (namely, surrogate code points cannot
+ % be encoded in UTF-8 strings).
%
:- func duplicate_char(char::in, int::in) = (string::uo) is det.
:- pred duplicate_char(char::in, int::in, string::uo) is det.
@@ -2390,7 +2392,7 @@ unsafe_index(S, N) = C :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- Ch = Str[Index];
+ Ch = (unsigned char) Str[Index];
if (!MR_is_ascii(Ch)) {
int width;
Ch = MR_utf8_get_mb(Str, Index, &width);
@@ -2462,7 +2464,7 @@ unsafe_index_next_repl(Str, Index, NextIndex, Ch, MaybeReplaced) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- Ch = Str[Index];
+ Ch = (unsigned char) Str[Index];
ReplacedCodeUnit = -1;
if (MR_is_ascii(Ch)) {
NextIndex = Index + 1;
@@ -2472,7 +2474,7 @@ unsafe_index_next_repl(Str, Index, NextIndex, Ch, MaybeReplaced) :-
Ch = MR_utf8_get_next_mb(Str, &NextIndex);
if (Ch < 0) {
Ch = 0xfffd;
- ReplacedCodeUnit = Str[Index];
+ ReplacedCodeUnit = (unsigned char) Str[Index];
NextIndex = Index + 1;
}
SUCCESS_INDICATOR = MR_TRUE;
@@ -2563,7 +2565,7 @@ unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, MaybeReplaced) :-
SUCCESS_INDICATOR = MR_FALSE;
} else {
PrevIndex = Index - 1;
- Ch = Str[PrevIndex];
+ Ch = (unsigned char) Str[PrevIndex];
if (! MR_is_ascii(Ch)) {
Ch = MR_utf8_prev_get(Str, &PrevIndex);
// XXX MR_utf8_prev_get currently just scans backwards to find a
@@ -2571,7 +2573,7 @@ unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, MaybeReplaced) :-
// unaccounted for.
if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
Ch = 0xfffd;
- ReplacedCodeUnit = Str[Index - 1];
+ ReplacedCodeUnit = (unsigned char) Str[Index - 1];
PrevIndex = Index - 1;
}
}
@@ -3291,7 +3293,8 @@ all_match(P, String) :-
int::in) is semidet.
all_match_loop(P, String, Cur) :-
- ( if unsafe_index_next_repl(String, Cur, Next, Char, not_replaced) then
+ ( if unsafe_index_next_repl(String, Cur, Next, Char, MaybeReplaced) then
+ MaybeReplaced = not_replaced,
P(Char),
all_match_loop(P, String, Next)
else
@@ -3362,8 +3365,11 @@ contains_char(String, Char) :-
:- pred contains_char_loop(string::in, char::in, int::in) is semidet.
contains_char_loop(Str, Char, I) :-
- unsafe_index_next_repl(Str, I, J, IndexChar, not_replaced),
- ( if IndexChar = Char then
+ unsafe_index_next_repl(Str, I, J, IndexChar, MaybeReplaced),
+ ( if
+ MaybeReplaced = not_replaced,
+ IndexChar = Char
+ then
true
else
contains_char_loop(Str, Char, J)
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index caa47e9ac..484cd7e40 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -79,7 +79,6 @@ ORDINARY_PROGS = \
construct_test \
construct_test_exist \
contains_char \
- contains_char_2 \
contravariance_bug \
contravariance_poly \
curry \
@@ -373,6 +372,7 @@ ORDINARY_PROGS = \
stream_test \
string_alignment \
string_alignment_bug \
+ string_all_match \
string_append_iii \
string_append_ioi \
string_append_ooi \
@@ -385,6 +385,7 @@ ORDINARY_PROGS = \
string_codepoint \
string_codepoint_offset_ilseq \
string_compare_substrings \
+ string_contains_char \
string_contains_match \
string_count_codepoints_ilseq \
string_first_char \
diff --git a/tests/hard_coded/contains_char_2.exp b/tests/hard_coded/contains_char_2.exp
deleted file mode 100644
index 4745bd879..000000000
--- a/tests/hard_coded/contains_char_2.exp
+++ /dev/null
@@ -1 +0,0 @@
-test succeeded
diff --git a/tests/hard_coded/contains_char_2.m b/tests/hard_coded/contains_char_2.m
deleted file mode 100644
index 07628c060..000000000
--- a/tests/hard_coded/contains_char_2.m
+++ /dev/null
@@ -1,39 +0,0 @@
-%---------------------------------------------------------------------------%
-% vim: ts=4 sw=4 et ft=mercury
-%---------------------------------------------------------------------------%
-
-:- module contains_char_2.
-:- interface.
-
-:- import_module io.
-
-:- pred main(io::di, io::uo) is det.
-
-%---------------------------------------------------------------------------%
-%---------------------------------------------------------------------------%
-
-:- implementation.
-
-:- import_module string.
-
-%---------------------------------------------------------------------------%
-
-main(!IO) :-
- ( if
- string.contains_char("cat", 'c'),
- string.contains_char("cat", 'a'),
- string.contains_char("cat", 't'),
- not string.contains_char("cat", 'm'),
- string.contains_char("aΓΞΎεπ.", 'Γ'),
- string.contains_char("aΓΞΎεπ.", 'Γ'),
- string.contains_char("aΓΞΎεπ.", 'ΞΎ'),
- string.contains_char("aΓΞΎεπ.", 'ε'),
- string.contains_char("aΓΞΎεπ.", '.'),
- not string.contains_char("aΓΞΎεπ.", 'βΏ')
- then
- io.write_string("test succeeded\n", !IO)
- else
- io.write_string("test failed\n", !IO)
- ).
-
-%---------------------------------------------------------------------------%
diff --git a/tests/hard_coded/string_all_match.exp b/tests/hard_coded/string_all_match.exp
new file mode 100644
index 000000000..2d86efdd6
--- /dev/null
+++ b/tests/hard_coded/string_all_match.exp
@@ -0,0 +1,6 @@
+all_match(is_alpha, "") ==> true.
+all_match(is_alpha, "abc") ==> true.
+all_match(is_alpha, "abc123") ==> false.
+all_match(is_chess_piece, "ββββββββ") ==> true.
+all_match(is_chess_piece, "ββββKβββ") ==> false.
+all_match(is_alpha, "abcοΏ½def") ==> false.
diff --git a/tests/hard_coded/string_all_match.exp2 b/tests/hard_coded/string_all_match.exp2
new file mode 100644
index 000000000..9847dac62
--- /dev/null
+++ b/tests/hard_coded/string_all_match.exp2
@@ -0,0 +1,6 @@
+all_match(is_alpha, "") ==> true.
+all_match(is_alpha, "abc") ==> true.
+all_match(is_alpha, "abc123") ==> false.
+all_match(is_chess_piece, "ββββββββ") ==> true.
+all_match(is_chess_piece, "ββββKβββ") ==> false.
+all_match(is_alpha, "abc?def") ==> false.
diff --git a/tests/hard_coded/string_all_match.m b/tests/hard_coded/string_all_match.m
new file mode 100644
index 000000000..fd0b4d4fc
--- /dev/null
+++ b/tests/hard_coded/string_all_match.m
@@ -0,0 +1,61 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+% A test of string.all_match/2.
+%
+% The .exp file is for C and C# grades.
+% The .exp2 file is for Java grades.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_all_match.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ test_all_match(is_alpha, "is_alpha", "", !IO),
+ test_all_match(is_alpha, "is_alpha", "abc", !IO),
+ test_all_match(is_alpha, "is_alpha", "abc123", !IO),
+ test_all_match(is_chess_piece, "is_chess_piece", "ββββββββ", !IO),
+ test_all_match(is_chess_piece, "is_chess_piece", "ββββKβββ", !IO),
+
+ Ilseq = string.between("π", 0, 1),
+ S = "abc" ++ Ilseq ++ "def",
+ test_all_match(is_alpha, "is_alpha", S, !IO).
+
+:- pred test_all_match(pred(char)::in(pred(in) is semidet),
+ string::in, string::in, io::di, io::uo) is det.
+
+test_all_match(Pred, PredName, String, !IO) :-
+ ( if string.all_match(Pred, String) then
+ Result = "true"
+ else
+ Result = "false"
+ ),
+ io.format("all_match(%s, %s) ==> %s.\n",
+ [s(PredName), s(string(String)), s(Result)], !IO).
+
+:- pred is_chess_piece(char::in) is semidet.
+
+is_chess_piece(Char) :-
+ char.to_int(Char, CodePoint),
+ CodePoint >= 0x2654, CodePoint =< 0x265F.
+
+%---------------------------------------------------------------------------%
+:- end_module string_all_match.
+%---------------------------------------------------------------------------%
diff --git a/tests/hard_coded/string_contains_char.exp b/tests/hard_coded/string_contains_char.exp
new file mode 100644
index 000000000..6eacc76a0
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.exp
@@ -0,0 +1,15 @@
+contains_char("", 'X') ==> false.
+contains_char("abc", 'X') ==> false.
+contains_char("abc", 'a') ==> true.
+contains_char("abc", 'b') ==> true.
+contains_char("abc", 'c') ==> true.
+contains_char("aΓΞΎεπ.", 'Γ') ==> true.
+contains_char("aΓΞΎεπ.", 'ΞΎ') ==> true.
+contains_char("aΓΞΎεπ.", 'ε') ==> true.
+contains_char("aΓΞΎεπ.", '.') ==> true.
+contains_char("aΓΞΎεπ.", 'βΏ') ==> false.
+contains_char("abcοΏ½123", 'c') ==> true.
+contains_char("abcοΏ½123", '1') ==> true.
+contains_char("abcοΏ½123", '2') ==> true.
+contains_char("abcοΏ½123", '3') ==> true.
+contains_char("abcοΏ½123", 'οΏ½') ==> false.
diff --git a/tests/hard_coded/string_contains_char.exp2 b/tests/hard_coded/string_contains_char.exp2
new file mode 100644
index 000000000..430748e2a
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.exp2
@@ -0,0 +1,15 @@
+contains_char("", 'X') ==> false.
+contains_char("abc", 'X') ==> false.
+contains_char("abc", 'a') ==> true.
+contains_char("abc", 'b') ==> true.
+contains_char("abc", 'c') ==> true.
+contains_char("aΓΞΎεπ.", 'Γ') ==> true.
+contains_char("aΓΞΎεπ.", 'ΞΎ') ==> true.
+contains_char("aΓΞΎεπ.", 'ε') ==> true.
+contains_char("aΓΞΎεπ.", '.') ==> true.
+contains_char("aΓΞΎεπ.", 'βΏ') ==> false.
+contains_char("abc?123", 'c') ==> true.
+contains_char("abc?123", '1') ==> true.
+contains_char("abc?123", '2') ==> true.
+contains_char("abc?123", '3') ==> true.
+contains_char("abc?123", 'οΏ½') ==> false.
diff --git a/tests/hard_coded/string_contains_char.m b/tests/hard_coded/string_contains_char.m
new file mode 100644
index 000000000..4f9c1f1af
--- /dev/null
+++ b/tests/hard_coded/string_contains_char.m
@@ -0,0 +1,63 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+% A test of string.contains_char/2.
+%
+% The .exp file is for C and C# grades.
+% The .exp2 file is for Java grades.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_contains_char.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ test_contains_char("", 'X', !IO),
+ test_contains_char("abc", 'X', !IO),
+ test_contains_char("abc", 'a', !IO),
+ test_contains_char("abc", 'b', !IO),
+ test_contains_char("abc", 'c', !IO),
+
+ test_contains_char("aΓΞΎεπ.", 'Γ', !IO),
+ test_contains_char("aΓΞΎεπ.", 'ΞΎ', !IO),
+ test_contains_char("aΓΞΎεπ.", 'ε', !IO),
+ test_contains_char("aΓΞΎεπ.", '.', !IO),
+ test_contains_char("aΓΞΎεπ.", 'βΏ', !IO),
+
+ Ilseq = string.between("π", 0, 1),
+ S = "abc" ++ Ilseq ++ "123",
+ test_contains_char(S, 'c', !IO),
+ test_contains_char(S, '1', !IO),
+ test_contains_char(S, '2', !IO),
+ test_contains_char(S, '3', !IO),
+ test_contains_char(S, '\uFFFD', !IO).
+
+:- pred test_contains_char(string::in, char::in, io::di, io::uo) is det.
+
+test_contains_char(String, Char, !IO) :-
+ ( if string.contains_char(String, Char) then
+ Result = "true"
+ else
+ Result = "false"
+ ),
+ io.format("contains_char(%s, %s) ==> %s.\n",
+ [s(string(String)), s(string(Char)), s(Result)], !IO).
+
+%---------------------------------------------------------------------------%
+:- end_module string_contains_char.
+%---------------------------------------------------------------------------%
--
2.36.1
More information about the reviews
mailing list