[m-rev.] for review: Change behaviour of string.prev_index on ill-formed sequences.
Mark Brown
mark at mercurylang.org
Thu Oct 17 03:08:21 AEDT 2019
Looks good to me.
On Wed, Oct 16, 2019 at 5:04 PM Peter Wang <novalazy at gmail.com> wrote:
> library/string.m:
> Make string.prev_index and string.unsafe_prev_index
> return either U+FFFD or an unpaired surrogate code point
> when an ill-formed code unit sequence is detected.
>
> tests/hard_coded/Mmakefile:
> tests/hard_coded/string_prev_index_ilseq.exp:
> tests/hard_coded/string_prev_index_ilseq.exp2:
> tests/hard_coded/string_prev_index_ilseq.m:
> Add test case.
> ---
> library/string.m | 91 ++++++++++---------
> tests/hard_coded/Mmakefile | 1 +
> tests/hard_coded/string_prev_index_ilseq.exp | 5 +
> tests/hard_coded/string_prev_index_ilseq.exp2 | 5 +
> tests/hard_coded/string_prev_index_ilseq.m | 44 +++++++++
> 5 files changed, 104 insertions(+), 42 deletions(-)
> create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp
> create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp2
> create mode 100644 tests/hard_coded/string_prev_index_ilseq.m
>
> diff --git a/library/string.m b/library/string.m
> index b687cece3..1b0d32e87 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -299,19 +299,27 @@
> %
> :- pred unsafe_index_next(string::in, int::in, int::out, char::uo) is
> semidet.
>
> - % prev_index(String, Index, CharIndex, Char):
> + % prev_index(String, Index, PrevIndex, Char):
> %
> - % `Char' is the character (code point) in `String' immediately
> _before_
> - % the code unit `Index'. Fails if `Index' is out of range
> (non-positive,
> - % or greater than the length of `String').
> + % If `Index - 1' is the final code unit offset of a well-formed
> sequence in
> + % `String' then `Char' is the code point encoded by that sequence, and
> + % `PrevIndex' is the initial code unit offset of that sequence.
> + %
> + % If the code unit in `String' at `Index - 1' is part of an ill-formed
> + % sequence then `Char' is either U+FFFD REPLACEMENT CHARACTER (if
> strings
> + % use UTF-8 encoding) or the unpaired surrogate code point at `Index
> - 1'
> + % (if strings use UTF-16 encoding), and `PrevIndex' is `Index - 1'.
> + %
> + % Fails if `Index' is out of range (non-positive, or greater than the
> + % length of `String').
> %
> :- pred prev_index(string::in, int::in, int::out, char::uo) is semidet.
>
> - % unsafe_prev_index(String, Index, CharIndex, Char):
> + % unsafe_prev_index(String, Index, PrevIndex, Char):
> + %
> + % Like prev_index/4 but does not check that `Index' is in range.
> + % Fails if `Index' is zero.
> %
> - % `Char' is the character (code point) in `String' immediately
> _before_
> - % the code unit `Index'. `CharIndex' is the offset of the beginning of
> - % `Char'. Fails if `Index' is zero.
> % WARNING: behavior is UNDEFINED if `Index' is out of range
> % (negative, or greater than the length of `String').
> %
> @@ -2350,37 +2358,37 @@ index_next(Str, Index, NextIndex, Char) :-
> end
> ").
>
> -prev_index(Str, Index, CharIndex, Char) :-
> +prev_index(Str, Index, PrevIndex, Char) :-
> Len = length(Str),
> ( if index_check(Index - 1, Len) then
> - unsafe_prev_index(Str, Index, CharIndex, Char)
> + unsafe_prev_index(Str, Index, PrevIndex, Char)
> else
> fail
> ).
>
> -% XXX ILSEQ Behaviour of unsafe_prev_index depends on target language.
> -% - c: fails on ill-formed sequence
> -% - java: returns unpaired surrogate
> -% - csharp: fails on unpaired surrogate (excepting for the bug)
> :- pragma foreign_proc("C",
> unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
> [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> does_not_affect_liveness, no_sharing],
> "
> - if (Index > 0) {
> + if (Index <= 0) {
> + PrevIndex = Index;
> + Ch = 0;
> + SUCCESS_INDICATOR = MR_FALSE;
> + } else {
> PrevIndex = Index - 1;
> Ch = Str[PrevIndex];
> - if (MR_is_ascii(Ch)) {
> - SUCCESS_INDICATOR = (Ch != 0);
> - } else {
> + if (! MR_is_ascii(Ch)) {
> Ch = MR_utf8_prev_get(Str, &PrevIndex);
> - // XXX ILSEQ
> - // SUCCESS_INDICATOR should be false if there are any extra
> bytes
> - // after the encoding of Ch, but before Index.
> - SUCCESS_INDICATOR = (Ch > 0);
> + // XXX MR_utf8_prev_get currently just scans backwards to
> find a
> + // lead byte, so we need a separate check to ensure no bytes
> are
> + // unaccounted for.
> + if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
> + Ch = 0xfffd;
> + PrevIndex = Index - 1;
> + }
> }
> - } else {
> - SUCCESS_INDICATOR = MR_FALSE;
> + SUCCESS_INDICATOR = MR_TRUE;
> }
> ").
> :- pragma foreign_proc("C#",
> @@ -2388,33 +2396,31 @@ prev_index(Str, Index, CharIndex, Char) :-
> [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> does_not_affect_liveness, no_sharing],
> "
> - if (Index > 0) {
> + if (Index <= 0) {
> + Ch = 0;
> + PrevIndex = Index;
> + SUCCESS_INDICATOR = false;
> + } else {
> char c2 = Str[Index - 1];
> - if (System.Char.IsSurrogate(c2)) {
> + if (System.Char.IsLowSurrogate(c2)) {
> try {
> char c1 = Str[Index - 2];
> Ch = System.Char.ConvertToUtf32(c1, c2);
> PrevIndex = Index - 2;
> - } catch (System.IndexOutOfRangeException) {
> - Ch = -1;
> - PrevIndex = Index;
> - SUCCESS_INDICATOR = false;
> } catch (System.ArgumentOutOfRangeException) {
> - Ch = -1;
> - PrevIndex = Index;
> - SUCCESS_INDICATOR = false;
> + // Return unpaired surrogate code point.
> + Ch = (int) c2;
> + PrevIndex = Index - 1;
> + } catch (System.IndexOutOfRangeException) {
> + // Return unpaired surrogate code point.
> + Ch = (int) c2;
> + PrevIndex = Index - 1;
> }
> } else {
> - // Common case.
> Ch = (int) c2;
> PrevIndex = Index - 1;
> }
> - // XXX ILSEQ bug, check (Ch >= 0)
> SUCCESS_INDICATOR = true;
> - } else {
> - Ch = -1;
> - PrevIndex = Index;
> - SUCCESS_INDICATOR = false;
> }
> ").
> :- pragma foreign_proc("Java",
> @@ -2422,12 +2428,12 @@ prev_index(Str, Index, CharIndex, Char) :-
> [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> does_not_affect_liveness, no_sharing],
> "
> - if (Index > 0) {
> + try {
> Ch = Str.codePointBefore(Index);
> PrevIndex = Index - java.lang.Character.charCount(Ch);
> SUCCESS_INDICATOR = true;
> - } else {
> - Ch = -1;
> + } catch (IndexOutOfBoundsException e) {
> + Ch = 0;
> PrevIndex = Index;
> SUCCESS_INDICATOR = false;
> }
> @@ -2437,6 +2443,7 @@ prev_index(Str, Index, CharIndex, Char) :-
> [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> does_not_affect_liveness, may_not_duplicate, no_sharing],
> "
> + % XXX does not handle ill-formed sequences as described
> {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
> SUCCESS_INDICATOR = (Ch =/= -1)
> ").
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 8c317da86..84e38bd34 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -362,6 +362,7 @@ ORDINARY_PROGS = \
> string_index_next_ilseq \
> string_loop \
> string_presuffix \
> + string_prev_index_ilseq \
> string_set_char \
> string_split \
> string_split_2 \
> diff --git a/tests/hard_coded/string_prev_index_ilseq.exp
> b/tests/hard_coded/string_prev_index_ilseq.exp
> new file mode 100644
> index 000000000..3dbadd510
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.exp
> @@ -0,0 +1,5 @@
> +string.prev_index(S, 10, 6, 0x1f600)
> +string.prev_index(S, 6, 5, 0xfffd)
> +string.prev_index(S, 5, 4, 0xfffd)
> +string.prev_index(S, 4, 0, 0x1f600)
> +string.prev_index(S, 0, _, _) failed
> diff --git a/tests/hard_coded/string_prev_index_ilseq.exp2
> b/tests/hard_coded/string_prev_index_ilseq.exp2
> new file mode 100644
> index 000000000..8f61dd24d
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.exp2
> @@ -0,0 +1,5 @@
> +string.prev_index(S, 6, 4, 0x1f600)
> +string.prev_index(S, 4, 3, 0xd83d)
> +string.prev_index(S, 3, 2, 0xde00)
> +string.prev_index(S, 2, 0, 0x1f600)
> +string.prev_index(S, 0, _, _) failed
> diff --git a/tests/hard_coded/string_prev_index_ilseq.m
> b/tests/hard_coded/string_prev_index_ilseq.m
> new file mode 100644
> index 000000000..ce4b427b6
> --- /dev/null
> +++ b/tests/hard_coded/string_prev_index_ilseq.m
> @@ -0,0 +1,44 @@
>
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
>
> +%---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
> +
> +:- module string_prev_index_ilseq.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is det.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module list.
> +:- import_module string.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> + S0 = "😀",
> + S1 = string.between(S0, 0, 1),
> + S2 = string.between(S0, 1, 2),
> + S = S0 ++ S2 ++ S1 ++ S0,
> + test_prev_index(S, count_code_units(S), !IO).
> +
> +:- pred test_prev_index(string::in, int::in, io::di, io::uo) is det.
> +
> +test_prev_index(S, Index, !IO) :-
> + ( if string.prev_index(S, Index, PrevIndex, Char) then
> + io.format("string.prev_index(S, %d, %d, %#x)\n",
> + [i(Index), i(PrevIndex), i(char.to_int(Char))], !IO),
> + test_prev_index(S, PrevIndex, !IO)
> + else
> + io.format("string.prev_index(S, %d, _, _) failed\n",
> + [i(Index)], !IO)
> + ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191017/7d5c6605/attachment.html>
More information about the reviews
mailing list