[m-rev.] for review: Define behaviour of string.char_to_string on edge cases.

Mark Brown mark at mercurylang.org
Tue Oct 29 19:26:57 AEDT 2019


This looks fine.

On Tue, Oct 29, 2019 at 12:02 PM Peter Wang <novalazy at gmail.com> wrote:
>
> library/string.m:
>     Define behaviour of char_to_string when the string is not
>     well-formed or if the char is a surrogate code point.
>
>     Implement char_to_string/2 using multiple clauses
>     as the described behaviour doesn't match to_char_list/2.
>
> tests/hard_coded/Mmakefile:
> tests/hard_coded/char_to_string.exp:
> tests/hard_coded/char_to_string.exp2:
> tests/hard_coded/char_to_string.m:
>     Add test case.
> ---
>  library/string.m                     |  21 ++++--
>  tests/hard_coded/Mmakefile           |   1 +
>  tests/hard_coded/char_to_string.exp  |  11 +++
>  tests/hard_coded/char_to_string.exp2 |  11 +++
>  tests/hard_coded/char_to_string.m    | 106 +++++++++++++++++++++++++++
>  5 files changed, 145 insertions(+), 5 deletions(-)
>  create mode 100644 tests/hard_coded/char_to_string.exp
>  create mode 100644 tests/hard_coded/char_to_string.exp2
>  create mode 100644 tests/hard_coded/char_to_string.m
>
> diff --git a/library/string.m b/library/string.m
> index 82bc84fa3..aff17b069 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -1353,7 +1353,15 @@
>
>      % char_to_string(Char, String):
>      %
> -    % Converts a character (code point) to a string, or vice versa.
> +    % Converts a character to a string, or vice versa.
> +    % True if `String' is the well-formed string that encodes the code point
> +    % `Char'; or, if strings are UTF-16 encoded, `Char' is a surrogate code
> +    % point and `String' is the string that contains only that surrogate code
> +    % point. Otherwise, `char_to_string(Char, String)' is false.
> +    %
> +    % Throws an exception if `Char' is the null character or a code point that
> +    % cannot be encoded in a string (namely, surrogate code points cannot be
> +    % encoded in UTF-8 strings).
>      %
>  :- func char_to_string(char::in) = (string::uo) is det.
>  :- pred char_to_string(char, string).
> @@ -5340,10 +5348,13 @@ det_to_float(FloatString) = Float :-
>  char_to_string(C) = S1 :-
>      char_to_string(C, S1).
>
> -char_to_string(Char, String) :-
> -    % XXX ILSEQ Should fail when String is not a well-formed encoding of a
> -    % single code point.
> -    to_char_list(String, [Char]).
> +:- pragma promise_equivalent_clauses(char_to_string/2).
> +
> +char_to_string(Char::in, String::uo) :-
> +    from_char_list([Char], String).
> +char_to_string(Char::out, String::in) :-
> +    index_next_not_replaced(String, 0, NextIndex, Char),
> +    length(String, NextIndex).
>
>  from_char(Char) = char_to_string(Char).
>
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 048827e4a..826ea7d0d 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -54,6 +54,7 @@ ORDINARY_PROGS = \
>         cc_nondet_disj \
>         change_hunk_test \
>         char_signed \
> +       char_to_string \
>         char_unicode \
>         checked_nondet_tailcall \
>         checked_nondet_tailcall_noinline \
> diff --git a/tests/hard_coded/char_to_string.exp b/tests/hard_coded/char_to_string.exp
> new file mode 100644
> index 000000000..3bce46e39
> --- /dev/null
> +++ b/tests/hard_coded/char_to_string.exp
> @@ -0,0 +1,11 @@
> +exception: software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")
> +exception: software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")
> +exception: software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")
> +succeeded: 😀
> +
> +failed
> +failed
> +failed
> +failed
> +failed
> +succeeded: 😀
> diff --git a/tests/hard_coded/char_to_string.exp2 b/tests/hard_coded/char_to_string.exp2
> new file mode 100644
> index 000000000..716a1dfef
> --- /dev/null
> +++ b/tests/hard_coded/char_to_string.exp2
> @@ -0,0 +1,11 @@
> +exception: software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")
> +succeeded: 0xd83d
> +succeeded: 0xde00
> +succeeded: 😀
> +
> +failed
> +failed
> +failed
> +succeeded: 0xd83d
> +succeeded: 0xde00
> +succeeded: 😀
> diff --git a/tests/hard_coded/char_to_string.m b/tests/hard_coded/char_to_string.m
> new file mode 100644
> index 000000000..2027ce1c0
> --- /dev/null
> +++ b/tests/hard_coded/char_to_string.m
> @@ -0,0 +1,106 @@
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
> +%---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
> +%---------------------------------------------------------------------------%
> +
> +:- module char_to_string.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is cc_multi.
> +
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module list.
> +:- import_module string.
> +
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> +    % null character
> +    test_char_to_string_fwd(char.det_from_int(0), !IO),
> +    % surrogate code points
> +    test_char_to_string_fwd(char.det_from_int(0xd83d), !IO),
> +    test_char_to_string_fwd(char.det_from_int(0xde00), !IO),
> +    % non-BMP code point
> +    test_char_to_string_fwd('😀', !IO),
> +    io.nl(!IO),
> +
> +    S = "😀",
> +    S0 = between(S, 0, 1),
> +    S1 = between(S, 1, length(S)),
> +
> +    % empty string
> +    test_char_to_string_rev("", !IO),
> +    % string too long
> +    test_char_to_string_rev(S0 ++ S, !IO),
> +    test_char_to_string_rev(S ++ S0, !IO),
> +    % ill-formed (unpaired surrogate in UTF-16 backends)
> +    test_char_to_string_rev(S0, !IO),
> +    test_char_to_string_rev(S1, !IO),
> +    % non-BMP code point
> +    test_char_to_string_rev(S, !IO).
> +
> +:- pred test_char_to_string_fwd(char::in, io::di, io::uo) is cc_multi.
> +
> +test_char_to_string_fwd(Char, !IO) :-
> +    ( try []
> +        char_to_string(Char, Str)
> +    then
> +        io.write_string("succeeded: ", !IO),
> +        write_string_debug(Str, !IO),
> +        io.nl(!IO)
> +    catch_any Excp ->
> +        io.write_string("exception: ", !IO),
> +        io.write(Excp, !IO),
> +        io.nl(!IO)
> +    ).
> +
> +:- pred test_char_to_string_rev(string::in, io::di, io::uo) is cc_multi.
> +
> +test_char_to_string_rev(Str, !IO) :-
> +    ( try []
> +        char_to_string(Char, Str)
> +    then
> +        io.write_string("succeeded: ", !IO),
> +        write_char_or_hex(Char, !IO),
> +        io.nl(!IO)
> +    else
> +        io.write_string("failed\n", !IO)
> +    catch_any Excp ->
> +        io.write_string("exception: ", !IO),
> +        io.write(Excp, !IO),
> +        io.nl(!IO)
> +    ).
> +
> +:- pred write_string_debug(string::in, io::di, io::uo) is det.
> +
> +write_string_debug(S, !IO) :-
> +    write_string_debug_loop(S, 0, !IO).
> +
> +:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.
> +
> +write_string_debug_loop(S, Index, !IO) :-
> +    ( if string.index_next(S, Index, NextIndex, Char) then
> +        write_char_or_hex(Char, !IO),
> +        write_string_debug_loop(S, NextIndex, !IO)
> +    else
> +        true
> +    ).
> +
> +:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
> +
> +write_char_or_hex(Char, !IO) :-
> +    ( if char.is_surrogate(Char) then
> +        io.format("%#x", [i(char.to_int(Char))], !IO)
> +    else
> +        io.write_char(Char, !IO)
> +    ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews


More information about the reviews mailing list