[m-rev.] for review: Change behaviour of string.prev_index on ill-formed sequences.

Mark Brown mark at mercurylang.org
Thu Oct 17 03:08:21 AEDT 2019


Looks good to me.

On Wed, Oct 16, 2019 at 5:04 PM Peter Wang <novalazy at gmail.com> wrote:

> library/string.m:
>     Make string.prev_index and string.unsafe_prev_index
>     return either U+FFFD or an unpaired surrogate code point
>     when an ill-formed code unit sequence is detected.
>
> tests/hard_coded/Mmakefile:
> tests/hard_coded/string_prev_index_ilseq.exp:
> tests/hard_coded/string_prev_index_ilseq.exp2:
> tests/hard_coded/string_prev_index_ilseq.m:
>     Add test case.
> ---
>  library/string.m                              | 91 ++++++++++---------
>  tests/hard_coded/Mmakefile                    |  1 +
>  tests/hard_coded/string_prev_index_ilseq.exp  |  5 +
>  tests/hard_coded/string_prev_index_ilseq.exp2 |  5 +
>  tests/hard_coded/string_prev_index_ilseq.m    | 44 +++++++++
>  5 files changed, 104 insertions(+), 42 deletions(-)
>  create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp
>  create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp2
>  create mode 100644 tests/hard_coded/string_prev_index_ilseq.m
>
> diff --git a/library/string.m b/library/string.m
> index b687cece3..1b0d32e87 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -299,19 +299,27 @@
>      %
>  :- pred unsafe_index_next(string::in, int::in, int::out, char::uo) is
> semidet.
>
> -    % prev_index(String, Index, CharIndex, Char):
> +    % prev_index(String, Index, PrevIndex, Char):
>      %
> -    % `Char' is the character (code point) in `String' immediately
> _before_
> -    % the code unit `Index'. Fails if `Index' is out of range
> (non-positive,
> -    % or greater than the length of `String').
> +    % If `Index - 1' is the final code unit offset of a well-formed
> sequence in
> +    % `String' then `Char' is the code point encoded by that sequence, and
> +    % `PrevIndex' is the initial code unit offset of that sequence.
> +    %
> +    % If the code unit in `String' at `Index - 1' is part of an ill-formed
> +    % sequence then `Char' is either U+FFFD REPLACEMENT CHARACTER (if
> strings
> +    % use UTF-8 encoding) or the unpaired surrogate code point at `Index
> - 1'
> +    % (if strings use UTF-16 encoding), and `PrevIndex' is `Index - 1'.
> +    %
> +    % Fails if `Index' is out of range (non-positive, or greater than the
> +    % length of `String').
>      %
>  :- pred prev_index(string::in, int::in, int::out, char::uo) is semidet.
>
> -    % unsafe_prev_index(String, Index, CharIndex, Char):
> +    % unsafe_prev_index(String, Index, PrevIndex, Char):
> +    %
> +    % Like prev_index/4 but does not check that `Index' is in range.
> +    % Fails if `Index' is zero.
>      %
> -    % `Char' is the character (code point) in `String' immediately
> _before_
> -    % the code unit `Index'. `CharIndex' is the offset of the beginning of
> -    % `Char'. Fails if `Index' is zero.
>      % WARNING: behavior is UNDEFINED if `Index' is out of range
>      % (negative, or greater than the length of `String').
>      %
> @@ -2350,37 +2358,37 @@ index_next(Str, Index, NextIndex, Char) :-
>      end
>  ").
>
> -prev_index(Str, Index, CharIndex, Char) :-
> +prev_index(Str, Index, PrevIndex, Char) :-
>      Len = length(Str),
>      ( if index_check(Index - 1, Len) then
> -        unsafe_prev_index(Str, Index, CharIndex, Char)
> +        unsafe_prev_index(Str, Index, PrevIndex, Char)
>      else
>          fail
>      ).
>
> -% XXX ILSEQ Behaviour of unsafe_prev_index depends on target language.
> -%   - c: fails on ill-formed sequence
> -%   - java: returns unpaired surrogate
> -%   - csharp: fails on unpaired surrogate (excepting for the bug)
>  :- pragma foreign_proc("C",
>      unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> -    if (Index > 0) {
> +    if (Index <= 0) {
> +        PrevIndex = Index;
> +        Ch = 0;
> +        SUCCESS_INDICATOR = MR_FALSE;
> +    } else {
>          PrevIndex = Index - 1;
>          Ch = Str[PrevIndex];
> -        if (MR_is_ascii(Ch)) {
> -            SUCCESS_INDICATOR = (Ch != 0);
> -        } else {
> +        if (! MR_is_ascii(Ch)) {
>              Ch = MR_utf8_prev_get(Str, &PrevIndex);
> -            // XXX ILSEQ
> -            // SUCCESS_INDICATOR should be false if there are any extra
> bytes
> -            // after the encoding of Ch, but before Index.
> -            SUCCESS_INDICATOR = (Ch > 0);
> +            // XXX MR_utf8_prev_get currently just scans backwards to
> find a
> +            // lead byte, so we need a separate check to ensure no bytes
> are
> +            // unaccounted for.
> +            if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
> +                Ch = 0xfffd;
> +                PrevIndex = Index - 1;
> +            }
>          }
> -    } else {
> -        SUCCESS_INDICATOR = MR_FALSE;
> +        SUCCESS_INDICATOR = MR_TRUE;
>      }
>  ").
>  :- pragma foreign_proc("C#",
> @@ -2388,33 +2396,31 @@ prev_index(Str, Index, CharIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> -    if (Index > 0) {
> +    if (Index <= 0) {
> +        Ch = 0;
> +        PrevIndex = Index;
> +        SUCCESS_INDICATOR = false;
> +    } else {
>          char c2 = Str[Index - 1];
> -        if (System.Char.IsSurrogate(c2)) {
> +        if (System.Char.IsLowSurrogate(c2)) {
>              try {
>                  char c1 = Str[Index - 2];
>                  Ch = System.Char.ConvertToUtf32(c1, c2);
>                  PrevIndex = Index - 2;
> -            } catch (System.IndexOutOfRangeException) {
> -                Ch = -1;
> -                PrevIndex = Index;
> -                SUCCESS_INDICATOR = false;
>              } catch (System.ArgumentOutOfRangeException) {
> -                Ch = -1;
> -                PrevIndex = Index;
> -                SUCCESS_INDICATOR = false;
> +                // Return unpaired surrogate code point.
> +                Ch = (int) c2;
> +                PrevIndex = Index - 1;
> +            } catch (System.IndexOutOfRangeException) {
> +                // Return unpaired surrogate code point.
> +                Ch = (int) c2;
> +                PrevIndex = Index - 1;
>              }
>          } else {
> -            // Common case.
>              Ch = (int) c2;
>              PrevIndex = Index - 1;
>          }
> -        // XXX ILSEQ bug, check (Ch >= 0)
>          SUCCESS_INDICATOR = true;
> -    } else {
> -        Ch = -1;
> -        PrevIndex = Index;
> -        SUCCESS_INDICATOR = false;
>      }
>  ").
>  :- pragma foreign_proc("Java",
> @@ -2422,12 +2428,12 @@ prev_index(Str, Index, CharIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> -    if (Index > 0) {
> +    try {
>          Ch = Str.codePointBefore(Index);
>          PrevIndex = Index - java.lang.Character.charCount(Ch);
>          SUCCESS_INDICATOR = true;
> -    } else {
> -        Ch = -1;
> +    } catch (IndexOutOfBoundsException e) {
> +        Ch = 0;
>          PrevIndex = Index;
>          SUCCESS_INDICATOR = false;
>      }
> @@ -2437,6 +2443,7 @@ prev_index(Str, Index, CharIndex, Char) :-
>      [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
>          does_not_affect_liveness, may_not_duplicate, no_sharing],
>  "
> +    % XXX does not handle ill-formed sequences as described
>      {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
>      SUCCESS_INDICATOR = (Ch =/= -1)
>  ").
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 8c317da86..84e38bd34 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -362,6 +362,7 @@ ORDINARY_PROGS = \
>         string_index_next_ilseq \
>         string_loop \
>         string_presuffix \
> +       string_prev_index_ilseq \
>         string_set_char \
>         string_split \
>         string_split_2 \
> diff --git a/tests/hard_coded/string_prev_index_ilseq.exp
> b/tests/hard_coded/string_prev_index_ilseq.exp
> new file mode 100644
> index 000000000..3dbadd510
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.exp
> @@ -0,0 +1,5 @@
> +string.prev_index(S, 10, 6, 0x1f600)
> +string.prev_index(S, 6, 5, 0xfffd)
> +string.prev_index(S, 5, 4, 0xfffd)
> +string.prev_index(S, 4, 0, 0x1f600)
> +string.prev_index(S, 0, _, _) failed
> diff --git a/tests/hard_coded/string_prev_index_ilseq.exp2
> b/tests/hard_coded/string_prev_index_ilseq.exp2
> new file mode 100644
> index 000000000..8f61dd24d
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.exp2
> @@ -0,0 +1,5 @@
> +string.prev_index(S, 6, 4, 0x1f600)
> +string.prev_index(S, 4, 3, 0xd83d)
> +string.prev_index(S, 3, 2, 0xde00)
> +string.prev_index(S, 2, 0, 0x1f600)
> +string.prev_index(S, 0, _, _) failed
> diff --git a/tests/hard_coded/string_prev_index_ilseq.m
> b/tests/hard_coded/string_prev_index_ilseq.m
> new file mode 100644
> index 000000000..ce4b427b6
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.m
> @@ -0,0 +1,44 @@
>
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
>
> +%---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
> +
> +:- module string_prev_index_ilseq.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is det.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module list.
> +:- import_module string.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> +    S0 = "😀",
> +    S1 = string.between(S0, 0, 1),
> +    S2 = string.between(S0, 1, 2),
> +    S = S0 ++ S2 ++ S1 ++ S0,
> +    test_prev_index(S, count_code_units(S), !IO).
> +
> +:- pred test_prev_index(string::in, int::in, io::di, io::uo) is det.
> +
> +test_prev_index(S, Index, !IO) :-
> +    ( if string.prev_index(S, Index, PrevIndex, Char) then
> +        io.format("string.prev_index(S, %d, %d, %#x)\n",
> +            [i(Index), i(PrevIndex), i(char.to_int(Char))], !IO),
> +        test_prev_index(S, PrevIndex, !IO)
> +    else
> +        io.format("string.prev_index(S, %d, _, _) failed\n",
> +            [i(Index)], !IO)
> +    ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191017/7d5c6605/attachment.html>


More information about the reviews mailing list