[m-rev.] for review: Make string.replace_all with empty pattern preserve ill-formed sequences.

Mark Brown mark at mercurylang.org
Thu Nov 7 12:58:16 AEDT 2019


This looks fine.

On Thu, Nov 7, 2019 at 10:44 AM Peter Wang <novalazy at gmail.com> wrote:

> library/string.m:
>     Define behaviour of string.replace_all on ill-formed code unit
>     sequences when the pattern is empty.
>
>     Implement that behaviour.
>
> tests/general/string_replace.exp:
> tests/general/string_replace.exp2:
> tests/general/string_replace.m:
>     Extend test case.
>
>     Update code style.
> ---
>  library/string.m                  | 61 ++++++++++++++++++++++---
>  tests/general/string_replace.exp  |  4 ++
>  tests/general/string_replace.exp2 | 44 ++++++++++++++++++
>  tests/general/string_replace.m    | 75 +++++++++++++++++++++++--------
>  4 files changed, 159 insertions(+), 25 deletions(-)
>  create mode 100644 tests/general/string_replace.exp2
>
> diff --git a/library/string.m b/library/string.m
> index c65e857f8..7d6dda761 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -1077,11 +1077,19 @@
>      %
>  :- pred replace(string::in, string::in, string::in, string::uo) is
> semidet.
>
> -    % replace_all(String0, Search, Replace, String):
> +    % replace_all(String0, Pattern, Subst, String):
>      %
> -    % Replaces any occurrences of Search in String0 with Replace to give
> +    % Replaces any occurrences of Pattern in String0 with Subst to give
>      % String.
>      %
> +    % If Pattern is the empty string then Subst is inserted at every point
> +    % in String0 except between two code units in an encoding of a code
> point.
> +    % For example, these are true:
> +    %
> +    %   replace_all("", "", "|", "|")
> +    %   replace_all("a", "", "|", "|a|")
> +    %   replace_all("ab", "", "|", "|a|b|")
> +    %
>  :- func replace_all(string::in, string::in, string::in) = (string::uo) is
> det.
>  :- pred replace_all(string::in, string::in, string::in, string::uo) is
> det.
>
> @@ -4718,10 +4726,7 @@ replace_all(S1, S2, S3) = S4 :-
>
>  replace_all(Str, Pat, Subst, Result) :-
>      ( if Pat = "" then
> -        % XXX ILSEQ foldl cannot handle ill-formed sequences.
> -        F = (func(C, L) = [char_to_string(C) ++ Subst | L]),
> -        Foldl = foldl(F, Str, []),
> -        Result = append_list([Subst | list.reverse(Foldl)])
> +        replace_all_empty_pat(Str, Subst, Result)
>      else
>          PatLength = length(Pat),
>          replace_all_loop(Str, Pat, Subst, PatLength, 0, [],
> ReversedChunks),
> @@ -4729,6 +4734,50 @@ replace_all(Str, Pat, Subst, Result) :-
>          Result = append_list(Chunks)
>      ).
>
> +:- pred replace_all_empty_pat(string::in, string::in, string::uo) is det.
> +
> +replace_all_empty_pat(Str, Subst, Result) :-
> +    % This implementation is not the most efficient, but it is not
> expected
> +    % to be used much in practice.
> +    to_code_unit_list(Subst, SubstCodes),
> +    Codes0 = SubstCodes,
> +    replace_all_empty_pat_loop(Str, SubstCodes, length(Str), Codes0,
> Codes),
> +    ( if from_code_unit_list_allow_ill_formed(Codes, ResultPrime) then
> +        Result = ResultPrime
> +    else
> +        unexpected($pred, "string.from_code_unit_list_allow_ill_formed
> failed")
> +    ).
> +
> +:- pred replace_all_empty_pat_loop(string::in, list(int)::in, int::in,
> +    list(int)::in, list(int)::out) is det.
> +
> +replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes) :-
> +    ( if unsafe_prev_index(Str, Index, PrevIndex, Char) then
> +        char.to_int(Char, CharInt),
> +        ( if CharInt =< 0x7f then
> +            % Fast path for single code unit code points.
> +            Codes1 = [CharInt | Codes0]
> +        else
> +            prepend_code_units(Str, PrevIndex, Index - 1, Codes0, Codes1)
> +        ),
> +        Codes2 = Subst ++ Codes1,
> +        replace_all_empty_pat_loop(Str, Subst, PrevIndex, Codes2, Codes)
> +    else
> +        Codes = Codes0
> +    ).
> +
> +:- pred prepend_code_units(string::in, int::in, int::in,
> +    list(int)::in, list(int)::out) is det.
> +
> +prepend_code_units(Str, FirstIndex, Index, Codes0, Codes) :-
> +    unsafe_index_code_unit(Str, Index, Code),
> +    Codes1 = [Code | Codes0],
> +    ( if Index = FirstIndex then
> +        Codes = Codes1
> +    else
> +        prepend_code_units(Str, FirstIndex, Index - 1, Codes1, Codes)
> +    ).
> +
>  :- pred replace_all_loop(string::in, string::in, string::in,
>      int::in, int::in, list(string)::in, list(string)::out) is det.
>
> diff --git a/tests/general/string_replace.exp
> b/tests/general/string_replace.exp
> index eab49079f..d633a7822 100644
> --- a/tests/general/string_replace.exp
> +++ b/tests/general/string_replace.exp
> @@ -8,6 +8,8 @@ string__replace("aaa bbbb ccccc aaa", "", "**", Result)
>         "**aaa bbbb ccccc aaa"
>  string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
>         "**aßξ啕ßξ啕𐀀."
> +string__replace("[0x9f][0x98][0x80]😀", "", "**", Result)
> +       "**[0x9f][0x98][0x80]😀"
>  string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
>         " bbbb ccccc aaa"
>  string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
> @@ -28,6 +30,8 @@ string__replace_all("aaa bbbb ccccc aaa", "", "**",
> Result)
>         "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
>  string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
>         "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
> +string__replace_all("[0x9f][0x98][0x80]😀", "", "**", Result)
> +       "**[0x9f]**[0x98]**[0x80]**😀**"
>  string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
>         " bbbb ccccc "
>  string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
> diff --git a/tests/general/string_replace.exp2
> b/tests/general/string_replace.exp2
> new file mode 100644
> index 000000000..082aa9237
> --- /dev/null
> +++ b/tests/general/string_replace.exp2
> @@ -0,0 +1,44 @@
> +string__replace("", "a", "bc", Result)
> +       FAIL!
> +string__replace("aaa bbbb ccccc aaa", "aab", "**", Result)
> +       FAIL!
> +string__replace("aaa bbbb ccccc aaa", "aaaa", "**", Result)
> +       FAIL!
> +string__replace("aaa bbbb ccccc aaa", "", "**", Result)
> +       "**aaa bbbb ccccc aaa"
> +string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
> +       "**aßξ啕ßξ啕𐀀."
> +string__replace("[0xde00]😀", "", "**", Result)
> +       "**[0xde00]😀"
> +string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
> +       " bbbb ccccc aaa"
> +string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
> +       "aaa bbbb **ccc aaa"
> +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
> +       "a**啕ßξ啕𐀀."
> +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
> +       "a★★啕ßξ啕𐀀."
> +string__replace("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
> +       "aßξ***啕𐀀."
> +string__replace_all("", "a", "bc", Result)
> +       ""
> +string__replace_all("aaa bbbb ccccc aaa", "aab", "**", Result)
> +       "aaa bbbb ccccc aaa"
> +string__replace_all("aaa bbbb ccccc aaa", "aaaa", "**", Result)
> +       "aaa bbbb ccccc aaa"
> +string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
> +       "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
> +string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
> +       "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
> +string__replace_all("[0xde00]😀", "", "**", Result)
> +       "**[0xde00]**😀**"
> +string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
> +       " bbbb ccccc "
> +string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
> +       "aaa bbbb ****c aaa"
> +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
> +       "a**啕**啕𐀀."
> +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
> +       "a★★啕★★啕𐀀."
> +string__replace_all("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
> +       "aßξ***啕𐀀."
> diff --git a/tests/general/string_replace.m
> b/tests/general/string_replace.m
> index 2b905fe90..3dcfa2ae8 100644
> --- a/tests/general/string_replace.m
> +++ b/tests/general/string_replace.m
> @@ -1,6 +1,11 @@
>
>  %---------------------------------------------------------------------------%
>  % vim: ts=4 sw=4 et ft=mercury
>
>  %---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
>
>  :- module string_replace.
>
> @@ -14,6 +19,8 @@
>
>  :- implementation.
>
> +:- import_module char.
> +:- import_module int.
>  :- import_module list.
>  :- import_module string.
>
> @@ -22,6 +29,8 @@
>  main(!IO) :-
>      Str = "aaa bbbb ccccc aaa",
>      Str2 = "aßξ啕ßξ啕𐀀.",
> +    Smiley = "😀",
> +    Str3 = between(Smiley, 1, length(Smiley)) ++ Smiley,
>      Tests = [
>          % pattern not in string
>          {"", "a", "bc"},
> @@ -31,6 +40,7 @@ main(!IO) :-
>          % pattern is empty string
>          {Str, "", "**"},
>          {Str2, "", "**"},
> +        {Str3, "", "**"},
>
>          % pattern in string
>          {Str, "aaa", ""},
> @@ -39,31 +49,58 @@ main(!IO) :-
>          {Str2, "ßξ", "★★"},
>          {Str2, "啕ßξ", "***"}
>      ],
> -    list__foldl(test_replace, Tests, !IO),
> -    list__foldl(test_replace_all, Tests, !IO).
> +    list.foldl(test_replace, Tests, !IO),
> +    list.foldl(test_replace_all, Tests, !IO).
>
>  :- pred test_replace({string, string, string}::in, io::di, io::uo) is det.
>
>  test_replace({Str, Pat, Subst}, !IO) :-
> -    io__write_string("string__replace(\"" ++ Str ++
> -            "\", \"" ++ Pat ++
> -            "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
> -    ( string__replace(Str, Pat, Subst, Result) ->
> -        io__write(Result, !IO),
> -        io__nl(!IO)
> -    ;
> -        io__write_string("FAIL!\n", !IO)
> +    io.write_string("string__replace(\"", !IO),
> +    write_string_debug(Str, !IO),
> +    io.write_string("\", \"", !IO),
> +    io.write_string(Pat, !IO),
> +    io.write_string("\", \"", !IO),
> +    io.write_string(Subst, !IO),
> +    io.write_string("\", Result) \n\t", !IO),
> +    ( if string.replace(Str, Pat, Subst, Result) then
> +        io.write_string("\"", !IO),
> +        write_string_debug(Result, !IO),
> +        io.write_string("\"\n", !IO)
> +    else
> +        io.write_string("FAIL!\n", !IO)
>      ).
>
>  :- pred test_replace_all({string, string, string}::in, io::di, io::uo) is
> det.
>
>  test_replace_all({Str, Pat, Subst}, !IO) :-
> -    io__write_string("string__replace_all(\"" ++ Str ++
> -            "\", \"" ++ Pat ++
> -            "\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
> -    string__replace_all(Str, Pat, Subst, Result),
> -    io__write(Result, !IO),
> -    io__nl(!IO).
> -
>
> -%---------------------------------------------------------------------------%
>
> -%---------------------------------------------------------------------------%
> +    io.write_string("string__replace_all(\"", !IO),
> +    write_string_debug(Str, !IO),
> +    io.write_string("\", \"", !IO),
> +    io.write_string(Pat, !IO),
> +    io.write_string("\", \"", !IO),
> +    io.write_string(Subst, !IO),
> +    io.write_string("\", Result) \n\t", !IO),
> +    io.write_string("\"", !IO),
> +    string.replace_all(Str, Pat, Subst, Result),
> +    write_string_debug(Result, !IO),
> +    io.write_string("\"\n", !IO).
> +
> +:- pred write_string_debug(string::in, io::di, io::uo) is det.
> +
> +write_string_debug(S, !IO) :-
> +    write_string_debug_loop(S, 0, !IO).
> +
> +:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is
> det.
> +
> +write_string_debug_loop(S, Index, !IO) :-
> +    ( if string.index_next(S, Index, NextIndex, Char) then
> +        ( if char.is_surrogate(Char) ; Char = '\ufffd' then
> +            unsafe_index_code_unit(S, Index, Code),
> +            io.format("[%#x]", [i(Code)], !IO)
> +        else
> +            io.write_char(Char, !IO)
> +        ),
> +        write_string_debug_loop(S, NextIndex, !IO)
> +    else
> +        true
> +    ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191107/57de208c/attachment-0001.html>


More information about the reviews mailing list