[m-rev.] for review: Define behaviour of string.codepoint_offset on ill-formed sequences.

Mark Brown mark at mercurylang.org
Thu Oct 17 04:09:43 AEDT 2019


This looks fine. Thankyou for the effort!

Mark

On Wed, Oct 16, 2019 at 5:06 PM Peter Wang <novalazy at gmail.com> wrote:

> library/string.m:
>     Define how string.codepoint_offset counts code units in ill-formed
>     sequences.
>
>     Delete C and C# foreign implementations in favour of the Mercury
>     implementation that has the intended behaviour.
>     (The Java implementation uses String.offsetByCodePoints which
>     also matches our intended behaviour.)
>
> tests/hard_coded/Mmakefile:
> tests/hard_coded/string_codepoint_offset_ilseq.exp2:
> tests/hard_coded/string_codepoint_offset_ilseq.m:
>     Add test case.
> ---
>  library/string.m                              | 58 ++++-----------
>  tests/hard_coded/Mmakefile                    |  1 +
>  .../string_codepoint_offset_ilseq.exp         | 30 ++++++++
>  .../string_codepoint_offset_ilseq.exp2        | 22 ++++++
>  .../string_codepoint_offset_ilseq.m           | 72 +++++++++++++++++++
>  5 files changed, 137 insertions(+), 46 deletions(-)
>  create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp
>  create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp2
>  create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.m
>
> diff --git a/library/string.m b/library/string.m
> index 10bbf6f5b..a497293d7 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -438,17 +438,23 @@
>      %
>  :- func count_utf8_code_units(string) = int.
>
> -    % codepoint_offset(String, StartOffset, CodePointCount,
> CodePointOffset):
> +    % codepoint_offset(String, StartOffset, Count, Offset):
>      %
> -    % Return the offset into `String' where, starting from `StartOffset',
> -    % `CodePointCount' code points are skipped. Fails if either
> `StartOffset'
> -    % or `CodePointOffset' are out of range.
> +    % Let `S' be the substring of `String' from code unit `StartOffset'
> to the
> +    % end of the string. `Offset' is code unit offset after advancing
> `Count'
> +    % steps in `S', where each step skips over either:
> +    %  - one encoding of a Unicode code point, or
> +    %  - one code unit that is part of an ill-formed sequence.
> +    %
> +    % Fails if `StartOffset' is out of range (negative, or greater than
> the
> +    % length of `String'), or if there are fewer than `Count' steps
> possible
> +    % in `S'.
>      %
>  :- pred codepoint_offset(string::in, int::in, int::in, int::out) is
> semidet.
>
> -    % codepoint_offset(String, CodePointCount, CodePointOffset):
> +    % codepoint_offset(String, Count, Offset):
>      %
> -    % Same as `codepoint_offset(String, 0, CodePointCount,
> CodePointOffset)'.
> +    % Same as `codepoint_offset(String, 0, Count, Offset)'.
>      %
>  :- pred codepoint_offset(string::in, int::in, int::out) is semidet.
>
> @@ -2807,43 +2813,6 @@ count_utf8_code_units_2(Char, !Length) :-
>
>  %---------------------%
>
> -% XXX ILSEQ Behaviour depends on target language.
> -% Should be made consistent so that each code unit in an ill-formed
> sequence
> -% counts as one (pseudo) code point.
> -
> -:- pragma foreign_proc("C",
> -    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
> -    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
> -"
> -    size_t          len;
> -    unsigned char   b;
> -
> -    SUCCESS_INDICATOR = MR_FALSE;
> -    len = strlen(String);
> -    for (Index = StartOffset; Index < len; Index++) {
> -        b = String[Index];
> -        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
> -            if (N-- == 0) {
> -                SUCCESS_INDICATOR = MR_TRUE;
> -                break;
> -            }
> -        }
> -    }
> -").
> -:- pragma foreign_proc("C#",
> -    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
> -    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
> -"
> -    SUCCESS_INDICATOR = false;
> -    for (Index = StartOffset; Index < String.Length; Index++) {
> -        if (!System.Char.IsLowSurrogate(String, Index)) {
> -            if (N-- == 0) {
> -                SUCCESS_INDICATOR = true;
> -                break;
> -            }
> -        }
> -    }
> -").
>  :- pragma foreign_proc("Java",
>      codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
>      [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
> @@ -2877,9 +2846,6 @@ codepoint_offset_loop(String, Offset, Length, N,
> Index) :-
>
>  %---------------------------------------------------------------------------%
>
>  codepoint_offset(String, N, Index) :-
> -    % XXX ILSEQ
> -    % Note: we do not define what happens with unpaired surrogates.
> -    %
>      codepoint_offset(String, 0, N, Index).
>
>
>  %---------------------------------------------------------------------------%
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 626f60c6a..c7c30ba02 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -358,6 +358,7 @@ ORDINARY_PROGS = \
>         string_class \
>         string_code_unit \
>         string_codepoint \
> +       string_codepoint_offset_ilseq \
>         string_count_codepoints_ilseq \
>         string_first_char \
>         string_fold_ilseq \
> diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp
> b/tests/hard_coded/string_codepoint_offset_ilseq.exp
> new file mode 100644
> index 000000000..fd2b09e09
> --- /dev/null
> +++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp
> @@ -0,0 +1,30 @@
> +start counting from offset 0
> +string.codepoint_offset(S, 0, 0, 0); Char = a
> +string.codepoint_offset(S, 0, 1, 1); Char = 😀
> +string.codepoint_offset(S, 0, 2, 5); Char = b
> +string.codepoint_offset(S, 0, 3, 6); Char = 0xfffd
> +string.codepoint_offset(S, 0, 4, 7); Char = 0xfffd
> +string.codepoint_offset(S, 0, 5, 8); Char = 0xfffd
> +string.codepoint_offset(S, 0, 6, 9); Char = z
> +string.codepoint_offset(S, 0, 7, _) failed
> +
> +start counting from offset 1
> +string.codepoint_offset(S, 1, 0, 1); Char = 😀
> +string.codepoint_offset(S, 1, 1, 5); Char = b
> +string.codepoint_offset(S, 1, 2, 6); Char = 0xfffd
> +string.codepoint_offset(S, 1, 3, 7); Char = 0xfffd
> +string.codepoint_offset(S, 1, 4, 8); Char = 0xfffd
> +string.codepoint_offset(S, 1, 5, 9); Char = z
> +string.codepoint_offset(S, 1, 6, _) failed
> +
> +start counting from offset 2
> +string.codepoint_offset(S, 2, 0, 2); Char = 0xfffd
> +string.codepoint_offset(S, 2, 1, 3); Char = 0xfffd
> +string.codepoint_offset(S, 2, 2, 4); Char = 0xfffd
> +string.codepoint_offset(S, 2, 3, 5); Char = b
> +string.codepoint_offset(S, 2, 4, 6); Char = 0xfffd
> +string.codepoint_offset(S, 2, 5, 7); Char = 0xfffd
> +string.codepoint_offset(S, 2, 6, 8); Char = 0xfffd
> +string.codepoint_offset(S, 2, 7, 9); Char = z
> +string.codepoint_offset(S, 2, 8, _) failed
> +
> diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp2
> b/tests/hard_coded/string_codepoint_offset_ilseq.exp2
> new file mode 100644
> index 000000000..fd5576062
> --- /dev/null
> +++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp2
> @@ -0,0 +1,22 @@
> +start counting from offset 0
> +string.codepoint_offset(S, 0, 0, 0); Char = a
> +string.codepoint_offset(S, 0, 1, 1); Char = 😀
> +string.codepoint_offset(S, 0, 2, 3); Char = b
> +string.codepoint_offset(S, 0, 3, 4); Char = 0xd83d
> +string.codepoint_offset(S, 0, 4, 5); Char = z
> +string.codepoint_offset(S, 0, 5, _) failed
> +
> +start counting from offset 1
> +string.codepoint_offset(S, 1, 0, 1); Char = 😀
> +string.codepoint_offset(S, 1, 1, 3); Char = b
> +string.codepoint_offset(S, 1, 2, 4); Char = 0xd83d
> +string.codepoint_offset(S, 1, 3, 5); Char = z
> +string.codepoint_offset(S, 1, 4, _) failed
> +
> +start counting from offset 2
> +string.codepoint_offset(S, 2, 0, 2); Char = 0xde00
> +string.codepoint_offset(S, 2, 1, 3); Char = b
> +string.codepoint_offset(S, 2, 2, 4); Char = 0xd83d
> +string.codepoint_offset(S, 2, 3, 5); Char = z
> +string.codepoint_offset(S, 2, 4, _) failed
> +
> diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.m
> b/tests/hard_coded/string_codepoint_offset_ilseq.m
> new file mode 100644
> index 000000000..cecb2329c
> --- /dev/null
> +++ b/tests/hard_coded/string_codepoint_offset_ilseq.m
> @@ -0,0 +1,72 @@
>
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
>
> +%---------------------------------------------------------------------------%
> +%
> +% The .exp file is for backends using UTF-8 string encoding.
> +% The .exp2 file is for backends using UTF-16 string encoding.
> +%
>
> +%---------------------------------------------------------------------------%
> +
> +:- module string_codepoint_offset_ilseq.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is det.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module int.
> +:- import_module list.
> +:- import_module string.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> +    S0 = "😀",
> +    S1 = string.between(S0, 0, count_code_units(S0) - 1),
> +    S = "a" ++ S0 ++ "b" ++ S1 ++ "z",
> +
> +    io.write_string("start counting from offset 0\n", !IO),
> +    test_codepoint_offset(S, 0, 0, !IO),
> +    io.nl(!IO),
> +
> +    io.write_string("start counting from offset 1\n", !IO),
> +    test_codepoint_offset(S, 1, 0, !IO),
> +    io.nl(!IO),
> +
> +    io.write_string("start counting from offset 2\n", !IO),
> +    test_codepoint_offset(S, 2, 0, !IO),
> +    io.nl(!IO).
> +
> +:- pred test_codepoint_offset(string::in, int::in, int::in, io::di,
> io::uo)
> +    is det.
> +
> +test_codepoint_offset(S, StartOffset, Count, !IO) :-
> +    ( if string.codepoint_offset(S, StartOffset, Count, Offset) then
> +        io.format("string.codepoint_offset(S, %d, %d, %d); ",
> +            [i(StartOffset), i(Count), i(Offset)], !IO),
> +        ( if string.index(S, Offset, Char) then
> +            io.write_string("Char = ", !IO),
> +            write_char_or_hex(Char, !IO),
> +            io.nl(!IO)
> +        else
> +            io.write_string("string.index/3 failed\n", !IO)
> +        ),
> +        test_codepoint_offset(S, StartOffset, Count + 1, !IO)
> +    else
> +        io.format("string.codepoint_offset(S, %d, %d, _) failed\n",
> +            [i(StartOffset), i(Count)], !IO)
> +    ).
> +
> +:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
> +
> +write_char_or_hex(Char, !IO) :-
> +    ( if Char = '\ufffd' ; char.is_surrogate(Char) then
> +        io.format("%#x", [i(char.to_int(Char))], !IO)
> +    else
> +        io.write_char(Char, !IO)
> +    ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191017/73ca098b/attachment-0001.html>


More information about the reviews mailing list