[m-rev.] for review: Define behaviour of string.index_next on ill-formed sequences.

Mark Brown mark at mercurylang.org
Thu Oct 17 02:55:43 AEDT 2019


This looks fine.

On Wed, Oct 16, 2019 at 5:04 PM Peter Wang <novalazy at gmail.com> wrote:

> library/string.m:
>     Make string.index_next and string.unsafe_index_next
>     return either U+FFFD or an unpaired surrogate code point
>     when an ill-formed code unit sequence is detected.
>
> tests/hard_coded/Mmakefile:
> tests/hard_coded/string_index_next_ilseq.exp:
> tests/hard_coded/string_index_next_ilseq.exp2:
> tests/hard_coded/string_index_next_ilseq.m:
>     Add test case.
> ---
>  library/string.m                              | 69 ++++++++++---------
>  tests/hard_coded/Mmakefile                    |  1 +
>  tests/hard_coded/string_index_next_ilseq.exp  |  5 ++
>  tests/hard_coded/string_index_next_ilseq.exp2 |  5 ++
>  tests/hard_coded/string_index_next_ilseq.m    | 44 ++++++++++++
>  5 files changed, 90 insertions(+), 34 deletions(-)
>  create mode 100644 tests/hard_coded/string_index_next_ilseq.exp
>  create mode 100644 tests/hard_coded/string_index_next_ilseq.exp2
>  create mode 100644 tests/hard_coded/string_index_next_ilseq.m
>
> diff --git a/library/string.m b/library/string.m
> index 09716dc49..b687cece3 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -274,17 +274,26 @@
>
>      % index_next(String, Index, NextIndex, Char):
>      %
> -    % Like `index'/3 but also returns the position of the code unit
> -    % that follows the code point beginning at `Index',
> -    % i.e. NextIndex = Index + num_code_units_to_encode(Char).
> +    % If `Index' is the initial code unit offset of a well-formed code
> unit
> +    % sequence in `String' then `Char' is the code point encoded by that
> +    % sequence, and `NextIndex' is the offset immediately following that
> +    % sequence.
> +    %
> +    % If the code unit in `String' at `Index' is part of an ill-formed
> sequence
> +    % then `Char' is either U+FFFD REPLACEMENT CHARACTER (if strings use
> UTF-8
> +    % encoding) or the unpaired surrogate code point at `Index' (if
> strings use
> +    % UTF-16 encoding), and `NextIndex' is Index + 1.
> +    %
> +    % Fails if `Index' is out of range (negative, or greater than or
> equal to
> +    % the length of `String').
>      %
>  :- pred index_next(string::in, int::in, int::out, char::uo) is semidet.
>
>      % unsafe_index_next(String, Index, NextIndex, Char):
>      %
> -    % `Char' is the character (code point) in `String', beginning at the
> -    % code unit `Index'. `NextIndex' is the offset following the encoding
> -    % of `Char'. Fails if `Index' is equal to the length of `String'.
> +    % Like index_next/4 but does not check that `Index' is in range.
> +    % Fails if `Index' is equal to the length of `String'.
> +    %
>      % WARNING: behavior is UNDEFINED if `Index' is out of range
>      % (negative, or greater than the length of `String').
>      %
> @@ -2166,9 +2175,7 @@ duplicate_char(Char, Count, String) :-
>  % so that the compiler can do loop invariant hoisting on calls to them
>  % that occur in loops.
>
> -% XXX ILSEQ The index_next family fails if an ill-formed sequence is
> detected.
> -% It needs to be updated to match the behaviour of the index family.
> -%
> +% XXX ILSEQ
>  % We should allow the possibility of working with strings containing
> ill-formed
>  % sequences. That would require predicates that can return either a code
> point
>  % when possible, or else code units from ill-formed sequences.
> @@ -2251,11 +2258,6 @@ String ^ unsafe_elem(Index) = unsafe_index(String,
> Index).
>
>  %---------------------%
>
> -% XXX ILSEQ index_next/4 and unsafe_index_next/4 fail when an ill-formed
> -% sequence is detected, unlike the index family. Most programs will
> probably
> -% work as intended if we just replace code units in ill-formed sequences
> with
> -% replacement char (for UTF-8) or return the unpaired surrogate code point
> -% (for UTF-16).
>  index_next(Str, Index, NextIndex, Char) :-
>      Len = length(Str),
>      ( if index_check(Index, Len) then
> @@ -2264,10 +2266,6 @@ index_next(Str, Index, NextIndex, Char) :-
>          fail
>      ).
>
> -% XXX ILSEQ Behaviour of unsafe_index_next depends on target language.
> -%   - c: fails at ill-formed sequence
> -%   - java: fails at unpaired surrogate
> -%   - csharp: System.Char.ConvertToUtf32 throws exception for unpaired
> surrogate
>  :- pragma foreign_proc("C",
>      unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> @@ -2280,7 +2278,11 @@ index_next(Str, Index, NextIndex, Char) :-
>      } else {
>          NextIndex = Index;
>          Ch = MR_utf8_get_next_mb(Str, &NextIndex);
> -        SUCCESS_INDICATOR = (Ch > 0);
> +        if (Ch < 0) {
> +            Ch = 0xfffd;
> +            NextIndex = Index + 1;
> +        }
> +        SUCCESS_INDICATOR = MR_TRUE;
>      }
>  ").
>  :- pragma foreign_proc("C#",
> @@ -2288,7 +2290,7 @@ index_next(Str, Index, NextIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> -    if (Index < Str.Length) {
> +    try {
>          Ch = System.Char.ConvertToUtf32(Str, Index);
>          if (Ch <= 0xffff) {
>              NextIndex = Index + 1;
> @@ -2296,10 +2298,15 @@ index_next(Str, Index, NextIndex, Char) :-
>              NextIndex = Index + 2;
>          }
>          SUCCESS_INDICATOR = true;
> -    } else {
> -        Ch = -1;
> +    } catch (System.ArgumentOutOfRangeException) {
> +        Ch = 0;
>          NextIndex = Index;
>          SUCCESS_INDICATOR = false;
> +    } catch (System.ArgumentException) {
> +        // Return unpaired surrogate code point.
> +        Ch = Str[Index];
> +        NextIndex = Index + 1;
> +        SUCCESS_INDICATOR = true;
>      }
>  ").
>  :- pragma foreign_proc("Java",
> @@ -2307,19 +2314,12 @@ index_next(Str, Index, NextIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> -    if (Index < Str.length()) {
> +    try {
>          Ch = Str.codePointAt(Index);
> -        SUCCESS_INDICATOR =
> -            !java.lang.Character.isHighSurrogate((char) Ch) &&
> -            !java.lang.Character.isLowSurrogate((char) Ch);
> -        if (SUCCESS_INDICATOR) {
> -            NextIndex = Index + java.lang.Character.charCount(Ch);
> -        } else {
> -            Ch = -1;
> -            NextIndex = Index;
> -        }
> -    } else {
> -        Ch = -1;
> +        NextIndex = Index + java.lang.Character.charCount(Ch);
> +        SUCCESS_INDICATOR = true;
> +    } catch (IndexOutOfBoundsException e) {
> +        Ch = 0;
>          NextIndex = Index;
>          SUCCESS_INDICATOR = false;
>      }
> @@ -2329,6 +2329,7 @@ index_next(Str, Index, NextIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    % XXX does not handle ill-formed sequences as described
>      case Str of
>          << _:Index/binary, Ch/utf8, _/binary >> ->
>              if
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 631d0da19..8c317da86 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -359,6 +359,7 @@ ORDINARY_PROGS = \
>         string_codepoint \
>         string_first_char \
>         string_index_ilseq \
> +       string_index_next_ilseq \
>         string_loop \
>         string_presuffix \
>         string_set_char \
> diff --git a/tests/hard_coded/string_index_next_ilseq.exp
> b/tests/hard_coded/string_index_next_ilseq.exp
> new file mode 100644
> index 000000000..1ad4aba9a
> --- /dev/null
> +++ b/tests/hard_coded/string_index_next_ilseq.exp
> @@ -0,0 +1,5 @@
> +string.index_next(S, 0, 4, 0x1f600)
> +string.index_next(S, 4, 5, 0xfffd)
> +string.index_next(S, 5, 6, 0xfffd)
> +string.index_next(S, 6, 10, 0x1f600)
> +string.index_next(S, 10, _, _) failed
> diff --git a/tests/hard_coded/string_index_next_ilseq.exp2
> b/tests/hard_coded/string_index_next_ilseq.exp2
> new file mode 100644
> index 000000000..c889bf0d8
> --- /dev/null
> +++ b/tests/hard_coded/string_index_next_ilseq.exp2
> @@ -0,0 +1,5 @@
> +string.index_next(S, 0, 2, 0x1f600)
> +string.index_next(S, 2, 3, 0xde00)
> +string.index_next(S, 3, 4, 0xd83d)
> +string.index_next(S, 4, 6, 0x1f600)
> +string.index_next(S, 6, _, _) failed
> diff --git a/tests/hard_coded/string_index_next_ilseq.m
> b/tests/hard_coded/string_index_next_ilseq.m
> new file mode 100644
> index 000000000..848a6fbae
> --- /dev/null
> +++ b/tests/hard_coded/string_index_next_ilseq.m
> @@ -0,0 +1,44 @@
>
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
>
> +%---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
> +
> +:- module string_index_next_ilseq.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is det.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module list.
> +:- import_module string.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> +    S0 = "😀",
> +    S1 = string.between(S0, 0, 1),
> +    S2 = string.between(S0, 1, 2),
> +    S = S0 ++ S2 ++ S1 ++ S0,
> +    test_index_next(S, 0, !IO).
> +
> +:- pred test_index_next(string::in, int::in, io::di, io::uo) is det.
> +
> +test_index_next(S, Index, !IO) :-
> +    ( if string.index_next(S, Index, NextIndex, Char) then
> +        io.format("string.index_next(S, %d, %d, %#x)\n",
> +            [i(Index), i(NextIndex), i(char.to_int(Char))], !IO),
> +        test_index_next(S, NextIndex, !IO)
> +    else
> +        io.format("string.index_next(S, %d, _, _) failed\n",
> +            [i(Index)], !IO)
> +    ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191017/fb7bc6ce/attachment-0001.html>


More information about the reviews mailing list