[m-rev.] for review: Make string.replace_all with empty pattern preserve ill-formed sequences.
Mark Brown
mark at mercurylang.org
Thu Nov 7 12:58:16 AEDT 2019
This looks fine.
On Thu, Nov 7, 2019 at 10:44 AM Peter Wang <novalazy at gmail.com> wrote:
> library/string.m:
> Define behaviour of string.replace_all on ill-formed code unit
> sequences when the pattern is empty.
>
> Implement that behaviour.
>
> tests/general/string_replace.exp:
> tests/general/string_replace.exp2:
> tests/general/string_replace.m:
> Extend test case.
>
> Update code style.
> ---
> library/string.m | 61 ++++++++++++++++++++++---
> tests/general/string_replace.exp | 4 ++
> tests/general/string_replace.exp2 | 44 ++++++++++++++++++
> tests/general/string_replace.m | 75 +++++++++++++++++++++++--------
> 4 files changed, 159 insertions(+), 25 deletions(-)
> create mode 100644 tests/general/string_replace.exp2
>
> diff --git a/library/string.m b/library/string.m
> index c65e857f8..7d6dda761 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -1077,11 +1077,19 @@
> %
> :- pred replace(string::in, string::in, string::in, string::uo) is
> semidet.
>
> - % replace_all(String0, Search, Replace, String):
> + % replace_all(String0, Pattern, Subst, String):
> %
> - % Replaces any occurrences of Search in String0 with Replace to give
> + % Replaces any occurrences of Pattern in String0 with Subst to give
> % String.
> %
> + % If Pattern is the empty string then Subst is inserted at every point
> + % in String0 except between two code units in an encoding of a code
> point.
> + % For example, these are true:
> + %
> + % replace_all("", "", "|", "|")
> + % replace_all("a", "", "|", "|a|")
> + % replace_all("ab", "", "|", "|a|b|")
> + %
> :- func replace_all(string::in, string::in, string::in) = (string::uo) is
> det.
> :- pred replace_all(string::in, string::in, string::in, string::uo) is
> det.
>
> @@ -4718,10 +4726,7 @@ replace_all(S1, S2, S3) = S4 :-
>
> replace_all(Str, Pat, Subst, Result) :-
> ( if Pat = "" then
> - % XXX ILSEQ foldl cannot handle ill-formed sequences.
> - F = (func(C, L) = [char_to_string(C) ++ Subst | L]),
> - Foldl = foldl(F, Str, []),
> - Result = append_list([Subst | list.reverse(Foldl)])
> + replace_all_empty_pat(Str, Subst, Result)
> else
> PatLength = length(Pat),
> replace_all_loop(Str, Pat, Subst, PatLength, 0, [],
> ReversedChunks),
> @@ -4729,6 +4734,50 @@ replace_all(Str, Pat, Subst, Result) :-
> Result = append_list(Chunks)
> ).
>
> +:- pred replace_all_empty_pat(string::in, string::in, string::uo) is det.
> +
> +replace_all_empty_pat(Str, Subst, Result) :-
> + % This implementation is not the most efficient, but it is not
> expected
> + % to be used much in practice.
> + to_code_unit_list(Subst, SubstCodes),
> + Codes0 = SubstCodes,
> + replace_all_empty_pat_loop(Str, SubstCodes, length(Str), Codes0,
> Codes),
> + ( if from_code_unit_list_allow_ill_formed(Codes, ResultPrime) then
> + Result = ResultPrime
> + else
> + unexpected($pred, "string.from_code_unit_list_allow_ill_formed
> failed")
> + ).
> +
> +:- pred replace_all_empty_pat_loop(string::in, list(int)::in, int::in,
> + list(int)::in, list(int)::out) is det.
> +
> +replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes) :-
> + ( if unsafe_prev_index(Str, Index, PrevIndex, Char) then
> + char.to_int(Char, CharInt),
> + ( if CharInt =< 0x7f then
> + % Fast path for single code unit code points.
> + Codes1 = [CharInt | Codes0]
> + else
> + prepend_code_units(Str, PrevIndex, Index - 1, Codes0, Codes1)
> + ),
> + Codes2 = Subst ++ Codes1,
> + replace_all_empty_pat_loop(Str, Subst, PrevIndex, Codes2, Codes)
> + else
> + Codes = Codes0
> + ).
> +
> +:- pred prepend_code_units(string::in, int::in, int::in,
> + list(int)::in, list(int)::out) is det.
> +
> +prepend_code_units(Str, FirstIndex, Index, Codes0, Codes) :-
> + unsafe_index_code_unit(Str, Index, Code),
> + Codes1 = [Code | Codes0],
> + ( if Index = FirstIndex then
> + Codes = Codes1
> + else
> + prepend_code_units(Str, FirstIndex, Index - 1, Codes1, Codes)
> + ).
> +
> :- pred replace_all_loop(string::in, string::in, string::in,
> int::in, int::in, list(string)::in, list(string)::out) is det.
>
> diff --git a/tests/general/string_replace.exp
> b/tests/general/string_replace.exp
> index eab49079f..d633a7822 100644
> --- a/tests/general/string_replace.exp
> +++ b/tests/general/string_replace.exp
> @@ -8,6 +8,8 @@ string__replace("aaa bbbb ccccc aaa", "", "**", Result)
> "**aaa bbbb ccccc aaa"
> string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
> "**aßξ啕ßξ啕𐀀."
> +string__replace("[0x9f][0x98][0x80]😀", "", "**", Result)
> + "**[0x9f][0x98][0x80]😀"
> string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
> " bbbb ccccc aaa"
> string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
> @@ -28,6 +30,8 @@ string__replace_all("aaa bbbb ccccc aaa", "", "**",
> Result)
> "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
> string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
> "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
> +string__replace_all("[0x9f][0x98][0x80]😀", "", "**", Result)
> + "**[0x9f]**[0x98]**[0x80]**😀**"
> string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
> " bbbb ccccc "
> string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
> diff --git a/tests/general/string_replace.exp2
> b/tests/general/string_replace.exp2
> new file mode 100644
> index 000000000..082aa9237
> --- /dev/null
> +++ b/tests/general/string_replace.exp2
> @@ -0,0 +1,44 @@
> +string__replace("", "a", "bc", Result)
> + FAIL!
> +string__replace("aaa bbbb ccccc aaa", "aab", "**", Result)
> + FAIL!
> +string__replace("aaa bbbb ccccc aaa", "aaaa", "**", Result)
> + FAIL!
> +string__replace("aaa bbbb ccccc aaa", "", "**", Result)
> + "**aaa bbbb ccccc aaa"
> +string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
> + "**aßξ啕ßξ啕𐀀."
> +string__replace("[0xde00]😀", "", "**", Result)
> + "**[0xde00]😀"
> +string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
> + " bbbb ccccc aaa"
> +string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
> + "aaa bbbb **ccc aaa"
> +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
> + "a**啕ßξ啕𐀀."
> +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
> + "a★★啕ßξ啕𐀀."
> +string__replace("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
> + "aßξ***啕𐀀."
> +string__replace_all("", "a", "bc", Result)
> + ""
> +string__replace_all("aaa bbbb ccccc aaa", "aab", "**", Result)
> + "aaa bbbb ccccc aaa"
> +string__replace_all("aaa bbbb ccccc aaa", "aaaa", "**", Result)
> + "aaa bbbb ccccc aaa"
> +string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
> + "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
> +string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
> + "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
> +string__replace_all("[0xde00]😀", "", "**", Result)
> + "**[0xde00]**😀**"
> +string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
> + " bbbb ccccc "
> +string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
> + "aaa bbbb ****c aaa"
> +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
> + "a**啕**啕𐀀."
> +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
> + "a★★啕★★啕𐀀."
> +string__replace_all("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
> + "aßξ***啕𐀀."
> diff --git a/tests/general/string_replace.m
> b/tests/general/string_replace.m
> index 2b905fe90..3dcfa2ae8 100644
> --- a/tests/general/string_replace.m
> +++ b/tests/general/string_replace.m
> @@ -1,6 +1,11 @@
>
> %---------------------------------------------------------------------------%
> % vim: ts=4 sw=4 et ft=mercury
>
> %---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
>
> :- module string_replace.
>
> @@ -14,6 +19,8 @@
>
> :- implementation.
>
> +:- import_module char.
> +:- import_module int.
> :- import_module list.
> :- import_module string.
>
> @@ -22,6 +29,8 @@
> main(!IO) :-
> Str = "aaa bbbb ccccc aaa",
> Str2 = "aßξ啕ßξ啕𐀀.",
> + Smiley = "😀",
> + Str3 = between(Smiley, 1, length(Smiley)) ++ Smiley,
> Tests = [
> % pattern not in string
> {"", "a", "bc"},
> @@ -31,6 +40,7 @@ main(!IO) :-
> % pattern is empty string
> {Str, "", "**"},
> {Str2, "", "**"},
> + {Str3, "", "**"},
>
> % pattern in string
> {Str, "aaa", ""},
> @@ -39,31 +49,58 @@ main(!IO) :-
> {Str2, "ßξ", "★★"},
> {Str2, "啕ßξ", "***"}
> ],
> - list__foldl(test_replace, Tests, !IO),
> - list__foldl(test_replace_all, Tests, !IO).
> + list.foldl(test_replace, Tests, !IO),
> + list.foldl(test_replace_all, Tests, !IO).
>
> :- pred test_replace({string, string, string}::in, io::di, io::uo) is det.
>
> test_replace({Str, Pat, Subst}, !IO) :-
> - io__write_string("string__replace(\"" ++ Str ++
> - "\", \"" ++ Pat ++
> - "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
> - ( string__replace(Str, Pat, Subst, Result) ->
> - io__write(Result, !IO),
> - io__nl(!IO)
> - ;
> - io__write_string("FAIL!\n", !IO)
> + io.write_string("string__replace(\"", !IO),
> + write_string_debug(Str, !IO),
> + io.write_string("\", \"", !IO),
> + io.write_string(Pat, !IO),
> + io.write_string("\", \"", !IO),
> + io.write_string(Subst, !IO),
> + io.write_string("\", Result) \n\t", !IO),
> + ( if string.replace(Str, Pat, Subst, Result) then
> + io.write_string("\"", !IO),
> + write_string_debug(Result, !IO),
> + io.write_string("\"\n", !IO)
> + else
> + io.write_string("FAIL!\n", !IO)
> ).
>
> :- pred test_replace_all({string, string, string}::in, io::di, io::uo) is
> det.
>
> test_replace_all({Str, Pat, Subst}, !IO) :-
> - io__write_string("string__replace_all(\"" ++ Str ++
> - "\", \"" ++ Pat ++
> - "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
> - string__replace_all(Str, Pat, Subst, Result),
> - io__write(Result, !IO),
> - io__nl(!IO).
> -
>
> -%---------------------------------------------------------------------------%
>
> -%---------------------------------------------------------------------------%
> + io.write_string("string__replace_all(\"", !IO),
> + write_string_debug(Str, !IO),
> + io.write_string("\", \"", !IO),
> + io.write_string(Pat, !IO),
> + io.write_string("\", \"", !IO),
> + io.write_string(Subst, !IO),
> + io.write_string("\", Result) \n\t", !IO),
> + io.write_string("\"", !IO),
> + string.replace_all(Str, Pat, Subst, Result),
> + write_string_debug(Result, !IO),
> + io.write_string("\"\n", !IO).
> +
> +:- pred write_string_debug(string::in, io::di, io::uo) is det.
> +
> +write_string_debug(S, !IO) :-
> + write_string_debug_loop(S, 0, !IO).
> +
> +:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is
> det.
> +
> +write_string_debug_loop(S, Index, !IO) :-
> + ( if string.index_next(S, Index, NextIndex, Char) then
> + ( if char.is_surrogate(Char) ; Char = '\ufffd' then
> + unsafe_index_code_unit(S, Index, Code),
> + io.format("[%#x]", [i(Code)], !IO)
> + else
> + io.write_char(Char, !IO)
> + ),
> + write_string_debug_loop(S, NextIndex, !IO)
> + else
> + true
> + ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191107/57de208c/attachment-0001.html>
More information about the reviews
mailing list