[m-rev.] for review: Make string.replace_all with empty pattern preserve ill-formed sequences.
Peter Wang
novalazy at gmail.com
Thu Nov 7 10:44:08 AEDT 2019
library/string.m:
Define behaviour of string.replace_all on ill-formed code unit
sequences when the pattern is empty.
Implement that behaviour.
tests/general/string_replace.exp:
tests/general/string_replace.exp2:
tests/general/string_replace.m:
Extend test case.
Update code style.
---
library/string.m | 61 ++++++++++++++++++++++---
tests/general/string_replace.exp | 4 ++
tests/general/string_replace.exp2 | 44 ++++++++++++++++++
tests/general/string_replace.m | 75 +++++++++++++++++++++++--------
4 files changed, 159 insertions(+), 25 deletions(-)
create mode 100644 tests/general/string_replace.exp2
diff --git a/library/string.m b/library/string.m
index c65e857f8..7d6dda761 100644
--- a/library/string.m
+++ b/library/string.m
@@ -1077,11 +1077,19 @@
%
:- pred replace(string::in, string::in, string::in, string::uo) is semidet.
- % replace_all(String0, Search, Replace, String):
+ % replace_all(String0, Pattern, Subst, String):
%
- % Replaces any occurrences of Search in String0 with Replace to give
+ % Replaces any occurrences of Pattern in String0 with Subst to give
% String.
%
+ % If Pattern is the empty string then Subst is inserted at every point
+ % in String0 except between two code units in an encoding of a code point.
+ % For example, these are true:
+ %
+ % replace_all("", "", "|", "|")
+ % replace_all("a", "", "|", "|a|")
+ % replace_all("ab", "", "|", "|a|b|")
+ %
:- func replace_all(string::in, string::in, string::in) = (string::uo) is det.
:- pred replace_all(string::in, string::in, string::in, string::uo) is det.
@@ -4718,10 +4726,7 @@ replace_all(S1, S2, S3) = S4 :-
replace_all(Str, Pat, Subst, Result) :-
( if Pat = "" then
- % XXX ILSEQ foldl cannot handle ill-formed sequences.
- F = (func(C, L) = [char_to_string(C) ++ Subst | L]),
- Foldl = foldl(F, Str, []),
- Result = append_list([Subst | list.reverse(Foldl)])
+ replace_all_empty_pat(Str, Subst, Result)
else
PatLength = length(Pat),
replace_all_loop(Str, Pat, Subst, PatLength, 0, [], ReversedChunks),
@@ -4729,6 +4734,50 @@ replace_all(Str, Pat, Subst, Result) :-
Result = append_list(Chunks)
).
+:- pred replace_all_empty_pat(string::in, string::in, string::uo) is det.
+
+replace_all_empty_pat(Str, Subst, Result) :-
+ % This implementation is not the most efficient, but it is not expected
+ % to be used much in practice.
+ to_code_unit_list(Subst, SubstCodes),
+ Codes0 = SubstCodes,
+ replace_all_empty_pat_loop(Str, SubstCodes, length(Str), Codes0, Codes),
+ ( if from_code_unit_list_allow_ill_formed(Codes, ResultPrime) then
+ Result = ResultPrime
+ else
+ unexpected($pred, "string.from_code_unit_list_allow_ill_formed failed")
+ ).
+
+:- pred replace_all_empty_pat_loop(string::in, list(int)::in, int::in,
+ list(int)::in, list(int)::out) is det.
+
+replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes) :-
+ ( if unsafe_prev_index(Str, Index, PrevIndex, Char) then
+ char.to_int(Char, CharInt),
+ ( if CharInt =< 0x7f then
+ % Fast path for single code unit code points.
+ Codes1 = [CharInt | Codes0]
+ else
+ prepend_code_units(Str, PrevIndex, Index - 1, Codes0, Codes1)
+ ),
+ Codes2 = Subst ++ Codes1,
+ replace_all_empty_pat_loop(Str, Subst, PrevIndex, Codes2, Codes)
+ else
+ Codes = Codes0
+ ).
+
+:- pred prepend_code_units(string::in, int::in, int::in,
+ list(int)::in, list(int)::out) is det.
+
+prepend_code_units(Str, FirstIndex, Index, Codes0, Codes) :-
+ unsafe_index_code_unit(Str, Index, Code),
+ Codes1 = [Code | Codes0],
+ ( if Index = FirstIndex then
+ Codes = Codes1
+ else
+ prepend_code_units(Str, FirstIndex, Index - 1, Codes1, Codes)
+ ).
+
:- pred replace_all_loop(string::in, string::in, string::in,
int::in, int::in, list(string)::in, list(string)::out) is det.
diff --git a/tests/general/string_replace.exp b/tests/general/string_replace.exp
index eab49079f..d633a7822 100644
--- a/tests/general/string_replace.exp
+++ b/tests/general/string_replace.exp
@@ -8,6 +8,8 @@ string__replace("aaa bbbb ccccc aaa", "", "**", Result)
"**aaa bbbb ccccc aaa"
string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**aßξ啕ßξ啕𐀀."
+string__replace("[0x9f][0x98][0x80]😀", "", "**", Result)
+ "**[0x9f][0x98][0x80]😀"
string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc aaa"
string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
@@ -28,6 +30,8 @@ string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
"**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
+string__replace_all("[0x9f][0x98][0x80]😀", "", "**", Result)
+ "**[0x9f]**[0x98]**[0x80]**😀**"
string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc "
string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
diff --git a/tests/general/string_replace.exp2 b/tests/general/string_replace.exp2
new file mode 100644
index 000000000..082aa9237
--- /dev/null
+++ b/tests/general/string_replace.exp2
@@ -0,0 +1,44 @@
+string__replace("", "a", "bc", Result)
+ FAIL!
+string__replace("aaa bbbb ccccc aaa", "aab", "**", Result)
+ FAIL!
+string__replace("aaa bbbb ccccc aaa", "aaaa", "**", Result)
+ FAIL!
+string__replace("aaa bbbb ccccc aaa", "", "**", Result)
+ "**aaa bbbb ccccc aaa"
+string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
+ "**aßξ啕ßξ啕𐀀."
+string__replace("[0xde00]😀", "", "**", Result)
+ "**[0xde00]😀"
+string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
+ " bbbb ccccc aaa"
+string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
+ "aaa bbbb **ccc aaa"
+string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
+ "a**啕ßξ啕𐀀."
+string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
+ "a★★啕ßξ啕𐀀."
+string__replace("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
+ "aßξ***啕𐀀."
+string__replace_all("", "a", "bc", Result)
+ ""
+string__replace_all("aaa bbbb ccccc aaa", "aab", "**", Result)
+ "aaa bbbb ccccc aaa"
+string__replace_all("aaa bbbb ccccc aaa", "aaaa", "**", Result)
+ "aaa bbbb ccccc aaa"
+string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
+ "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
+string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
+ "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
+string__replace_all("[0xde00]😀", "", "**", Result)
+ "**[0xde00]**😀**"
+string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
+ " bbbb ccccc "
+string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
+ "aaa bbbb ****c aaa"
+string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
+ "a**啕**啕𐀀."
+string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
+ "a★★啕★★啕𐀀."
+string__replace_all("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
+ "aßξ***啕𐀀."
diff --git a/tests/general/string_replace.m b/tests/general/string_replace.m
index 2b905fe90..3dcfa2ae8 100644
--- a/tests/general/string_replace.m
+++ b/tests/general/string_replace.m
@@ -1,6 +1,11 @@
%---------------------------------------------------------------------------%
% vim: ts=4 sw=4 et ft=mercury
%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
:- module string_replace.
@@ -14,6 +19,8 @@
:- implementation.
+:- import_module char.
+:- import_module int.
:- import_module list.
:- import_module string.
@@ -22,6 +29,8 @@
main(!IO) :-
Str = "aaa bbbb ccccc aaa",
Str2 = "aßξ啕ßξ啕𐀀.",
+ Smiley = "😀",
+ Str3 = between(Smiley, 1, length(Smiley)) ++ Smiley,
Tests = [
% pattern not in string
{"", "a", "bc"},
@@ -31,6 +40,7 @@ main(!IO) :-
% pattern is empty string
{Str, "", "**"},
{Str2, "", "**"},
+ {Str3, "", "**"},
% pattern in string
{Str, "aaa", ""},
@@ -39,31 +49,58 @@ main(!IO) :-
{Str2, "ßξ", "★★"},
{Str2, "啕ßξ", "***"}
],
- list__foldl(test_replace, Tests, !IO),
- list__foldl(test_replace_all, Tests, !IO).
+ list.foldl(test_replace, Tests, !IO),
+ list.foldl(test_replace_all, Tests, !IO).
:- pred test_replace({string, string, string}::in, io::di, io::uo) is det.
test_replace({Str, Pat, Subst}, !IO) :-
- io__write_string("string__replace(\"" ++ Str ++
- "\", \"" ++ Pat ++
- "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
- ( string__replace(Str, Pat, Subst, Result) ->
- io__write(Result, !IO),
- io__nl(!IO)
- ;
- io__write_string("FAIL!\n", !IO)
+ io.write_string("string__replace(\"", !IO),
+ write_string_debug(Str, !IO),
+ io.write_string("\", \"", !IO),
+ io.write_string(Pat, !IO),
+ io.write_string("\", \"", !IO),
+ io.write_string(Subst, !IO),
+ io.write_string("\", Result) \n\t", !IO),
+ ( if string.replace(Str, Pat, Subst, Result) then
+ io.write_string("\"", !IO),
+ write_string_debug(Result, !IO),
+ io.write_string("\"\n", !IO)
+ else
+ io.write_string("FAIL!\n", !IO)
).
:- pred test_replace_all({string, string, string}::in, io::di, io::uo) is det.
test_replace_all({Str, Pat, Subst}, !IO) :-
- io__write_string("string__replace_all(\"" ++ Str ++
- "\", \"" ++ Pat ++
- "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
- string__replace_all(Str, Pat, Subst, Result),
- io__write(Result, !IO),
- io__nl(!IO).
-
-%---------------------------------------------------------------------------%
-%---------------------------------------------------------------------------%
+ io.write_string("string__replace_all(\"", !IO),
+ write_string_debug(Str, !IO),
+ io.write_string("\", \"", !IO),
+ io.write_string(Pat, !IO),
+ io.write_string("\", \"", !IO),
+ io.write_string(Subst, !IO),
+ io.write_string("\", Result) \n\t", !IO),
+ io.write_string("\"", !IO),
+ string.replace_all(Str, Pat, Subst, Result),
+ write_string_debug(Result, !IO),
+ io.write_string("\"\n", !IO).
+
+:- pred write_string_debug(string::in, io::di, io::uo) is det.
+
+write_string_debug(S, !IO) :-
+ write_string_debug_loop(S, 0, !IO).
+
+:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.
+
+write_string_debug_loop(S, Index, !IO) :-
+ ( if string.index_next(S, Index, NextIndex, Char) then
+ ( if char.is_surrogate(Char) ; Char = '\ufffd' then
+ unsafe_index_code_unit(S, Index, Code),
+ io.format("[%#x]", [i(Code)], !IO)
+ else
+ io.write_char(Char, !IO)
+ ),
+ write_string_debug_loop(S, NextIndex, !IO)
+ else
+ true
+ ).
--
2.23.0
More information about the reviews
mailing list