[m-rev.] for review: Define behaviour of string.first_char/3 on edge cases.
Mark Brown
mark at mercurylang.org
Tue Oct 29 19:12:34 AEDT 2019
This looks fine.
On Tue, Oct 29, 2019 at 12:02 PM Peter Wang <novalazy at gmail.com> wrote:
> library/string.m:
> Define first_char/3 to fail if the input string begins with an
> ill-formed code unit sequence.
>
> Define the reverse mode to throw an exception on an attempt to
> encode a null character or surrogate code point in the output
> string.
>
> Reimplement first_char/3 in Mercury.
>
> hard_coded/Mmakefile:
> hard_coded/string_first_char_ilseq.exp:
> hard_coded/string_first_char_ilseq.m:
> Add test case.
> ---
> library/string.m | 373 ++++---------------
> tests/hard_coded/Mmakefile | 1 +
> tests/hard_coded/string_first_char_ilseq.exp | 10 +
> tests/hard_coded/string_first_char_ilseq.m | 97 +++++
> 4 files changed, 183 insertions(+), 298 deletions(-)
> create mode 100644 tests/hard_coded/string_first_char_ilseq.exp
> create mode 100644 tests/hard_coded/string_first_char_ilseq.m
>
> diff --git a/library/string.m b/library/string.m
> index cbdbcd128..82bc84fa3 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -670,8 +670,13 @@
> % Splitting up strings.
> %
>
> - % first_char(String, Char, Rest) is true iff Char is the first
> character
> - % (code point) of String, and Rest is the remainder.
> + % first_char(String, Char, Rest) is true iff `String' begins with a
> + % well-formed code unit sequence, `Char' is the code point encoded by
> + % that sequence, and `Rest' is the rest of `String' after that
> sequence.
> + %
> + % The (uo, in, in) mode throws an exception if `Char' cannot be
> encoded in
> + % a string, or if `Char' is a surrogate code point (for consistency
> with
> + % the other modes).
> %
> % WARNING: first_char makes a copy of Rest because the garbage
> collector
> % doesn't handle references into the middle of an object, at least
> not the
> @@ -2395,6 +2400,28 @@ index_next(Str, Index, NextIndex, Char) :-
> end
> ").
>
> +%---------------------%
> +
> + % XXX ILSEQ Provide public interfaces to index into strings while
> + % signalling if we encountered an ill-formed sequence.
> + %
> +:- pred index_next_not_replaced(string::in, int::in, int::out, char::uo)
> + is semidet.
> +
> +index_next_not_replaced(Str, Index, NextIndex, Char) :-
> + index_next(Str, Index, NextIndex, Char0),
> + ( if
> + internal_encoding_is_utf8,
> + Char0 = '\ufffd'
> + then
> + unsafe_between(Str, Index, NextIndex, "\ufffd")
> + else
> + true
> + ),
> + unsafe_promise_unique(Char0, Char).
> +
> +%---------------------%
> +
> prev_index(Str, Index, PrevIndex, Char) :-
> Len = length(Str),
> ( if index_check(Index - 1, Len) then
> @@ -2507,6 +2534,8 @@ do_unsafe_prev_index(Str, Index) ->
> end.
> ").
>
> +%---------------------%
> +
> % XXX We should consider making this routine a compiler built-in.
> %
> :- pred index_check(int::in, int::in) is semidet.
> @@ -3896,302 +3925,50 @@ join_list_loop(Sep, [H | T]) = Sep ++ H ++
> join_list_loop(Sep, T).
> % Splitting up strings.
> %
>
> -% XXX ILSEQ Behaviour depends on target language.
> -% - C: fails if the string begins with ill-formed sequence
> -% - Java/C#: succeeds if the string begins with an unpaired surrogate
> -
> -:- pragma foreign_proc("C",
> - first_char(Str::in, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> - does_not_affect_liveness, no_sharing],
> -"
> - MR_Integer pos = 0;
> - int c = MR_utf8_get_next(Str, &pos);
> - SUCCESS_INDICATOR = (
> - c == First &&
> - First != '\\0' &&
> - strcmp(Str + pos, Rest) == 0
> - );
> -").
> -:- pragma foreign_proc("C#",
> - first_char(Str::in, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - int len = Str.Length;
> - if (First <= 0xffff) {
> - SUCCESS_INDICATOR = (
> - len > 0 &&
> - Str[0] == First &&
> - System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
> - );
> - } else {
> - string firstchars = System.Char.ConvertFromUtf32(First);
> - SUCCESS_INDICATOR = (
> - len > 1 &&
> - Str[0] == firstchars[0] &&
> - Str[1] == firstchars[1] &&
> - System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
> - );
> - }
> -").
> -:- pragma foreign_proc("Java",
> - first_char(Str::in, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - int toffset = java.lang.Character.charCount(First);
> - SUCCESS_INDICATOR = (
> - Str.length() > 0 &&
> - Str.codePointAt(0) == First &&
> - Str.regionMatches(toffset, Rest, 0, Rest.length())
> - );
> -").
> -:- pragma foreign_proc("Erlang",
> - first_char(Str::in, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - case Str of
> - <<First/utf8, Rest/binary>> ->
> - SUCCESS_INDICATOR = true;
> - _ ->
> - SUCCESS_INDICATOR = false
> - end
> -").
> -
> -:- pragma foreign_proc("C",
> - first_char(Str::in, First::uo, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> - does_not_affect_liveness, no_sharing],
> -"
> - MR_Integer pos = 0;
> - First = MR_utf8_get_next(Str, &pos);
> - SUCCESS_INDICATOR = (First > 0 && strcmp(Str + pos, Rest) == 0);
> -").
> -:- pragma foreign_proc("C#",
> - first_char(Str::in, First::uo, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - try {
> - int len = Str.Length;
> - char c1 = Str[0];
> - if (System.Char.IsHighSurrogate(c1)) {
> - char c2 = Str[1];
> - First = System.Char.ConvertToUtf32(c1, c2);
> - SUCCESS_INDICATOR =
> - (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
> - } else {
> - First = c1;
> - SUCCESS_INDICATOR =
> - (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
> - }
> - } catch (System.IndexOutOfRangeException) {
> - SUCCESS_INDICATOR = false;
> - First = (char) 0;
> - } catch (System.ArgumentOutOfRangeException) {
> - SUCCESS_INDICATOR = false;
> - First = (char) 0;
> - }
> -").
> -:- pragma foreign_proc("Java",
> - first_char(Str::in, First::uo, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - int toffset;
> - if (Str.length() > 0) {
> - First = Str.codePointAt(0);
> - toffset = java.lang.Character.charCount(First);
> - SUCCESS_INDICATOR =
> - Str.regionMatches(toffset, Rest, 0, Rest.length());
> - } else {
> - SUCCESS_INDICATOR = false;
> - // XXX to avoid uninitialized var warning
> - First = 0;
> - }
> -").
> -:- pragma foreign_proc("Erlang",
> - first_char(Str::in, First::uo, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - case Str of
> - <<First/utf8, Rest/binary>> ->
> - SUCCESS_INDICATOR = true;
> - _ ->
> - SUCCESS_INDICATOR = false,
> - First = 0
> - end
> -").
> -
> -:- pragma foreign_proc("C",
> - first_char(Str::in, First::in, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> - does_not_affect_liveness, no_sharing],
> -"{
> - MR_Integer pos = 0;
> - int c = MR_utf8_get_next(Str, &pos);
> - if (c != First || First == '\\0') {
> - SUCCESS_INDICATOR = MR_FALSE;
> - } else {
> - Str += pos;
> - // We need to make a copy to ensure that the pointer is
> word-aligned.
> - MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
> - strcpy(Rest, Str);
> - SUCCESS_INDICATOR = MR_TRUE;
> - }
> -}").
> -:- pragma foreign_proc("C#",
> - first_char(Str::in, First::in, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - int len = Str.Length;
> -
> - if (len > 0) {
> - if (First <= 0xffff) {
> - SUCCESS_INDICATOR = (First == Str[0]);
> - Rest = Str.Substring(1);
> - } else {
> - string firststr = System.Char.ConvertFromUtf32(First);
> - SUCCESS_INDICATOR =
> - (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) ==
> 0);
> - Rest = Str.Substring(2);
> - }
> - } else {
> - SUCCESS_INDICATOR = false;
> - Rest = null;
> - }
> -}").
> -:- pragma foreign_proc("Java",
> - first_char(Str::in, First::in, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - int len = Str.length();
> -
> - if (len > 0) {
> - SUCCESS_INDICATOR = (First == Str.codePointAt(0));
> - Rest = Str.substring(java.lang.Character.charCount(First));
> - } else {
> - SUCCESS_INDICATOR = false;
> - // XXX to avoid uninitialized var warning
> - Rest = null;
> - }
> -}").
> -:- pragma foreign_proc("Erlang",
> - first_char(Str::in, First::in, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - case Str of
> - <<First/utf8, Rest/binary>> ->
> - SUCCESS_INDICATOR = true;
> - _ ->
> - SUCCESS_INDICATOR = false,
> - Rest = <<>>
> - end
> -").
> -
> -:- pragma foreign_proc("C",
> - first_char(Str::in, First::uo, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> - does_not_affect_liveness, no_sharing],
> -"{
> - MR_Integer pos = 0;
> - First = MR_utf8_get_next(Str, &pos);
> - if (First < 1) {
> - SUCCESS_INDICATOR = MR_FALSE;
> - } else {
> - Str += pos;
> - // We need to make a copy to ensure that the pointer is
> word-aligned.
> - MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
> - strcpy(Rest, Str);
> - SUCCESS_INDICATOR = MR_TRUE;
> - }
> -}").
> -:- pragma foreign_proc("C#",
> - first_char(Str::in, First::uo, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - try {
> - char c1 = Str[0];
> - if (System.Char.IsHighSurrogate(c1)) {
> - char c2 = Str[1];
> - First = System.Char.ConvertToUtf32(c1, c2);
> - Rest = Str.Substring(2);
> - } else {
> - First = Str[0];
> - Rest = Str.Substring(1);
> - }
> - SUCCESS_INDICATOR = true;
> - } catch (System.IndexOutOfRangeException) {
> - SUCCESS_INDICATOR = false;
> - First = (char) 0;
> - Rest = null;
> - } catch (System.ArgumentOutOfRangeException) {
> - SUCCESS_INDICATOR = false;
> - First = (char) 0;
> - Rest = null;
> - }
> -}").
> -:- pragma foreign_proc("Java",
> - first_char(Str::in, First::uo, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - if (Str.length() == 0) {
> - SUCCESS_INDICATOR = false;
> - First = (char) 0;
> - Rest = null;
> - } else {
> - First = Str.codePointAt(0);
> - Rest = Str.substring(java.lang.Character.charCount(First));
> - SUCCESS_INDICATOR = true;
> - }
> -}").
> -:- pragma foreign_proc("Erlang",
> - first_char(Str::in, First::uo, Rest::uo),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - case Str of
> - <<First/utf8, Rest/binary>> ->
> - SUCCESS_INDICATOR = true;
> - _ ->
> - SUCCESS_INDICATOR = false,
> - First = 0,
> - Rest = <<>>
> - end
> -").
> -
> -:- pragma foreign_proc("C",
> - first_char(Str::uo, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe,
> will_not_modify_trail,
> - does_not_affect_liveness, no_sharing],
> -"{
> - size_t firstw = MR_utf8_width(First);
> - size_t len = firstw + strlen(Rest);
> - MR_allocate_aligned_string_msg(Str, len, MR_ALLOC_ID);
> - MR_utf8_encode(Str, First);
> - strcpy(Str + firstw, Rest);
> -}").
> -:- pragma foreign_proc("C#",
> - first_char(Str::uo, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - string FirstStr;
> - if (First <= 0xffff) {
> - FirstStr = new System.String((char) First, 1);
> - } else {
> - FirstStr = System.Char.ConvertFromUtf32(First);
> - }
> - Str = FirstStr + Rest;
> -}").
> -:- pragma foreign_proc("Java",
> - first_char(Str::uo, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"{
> - String FirstStr = new String(Character.toChars(First));
> - Str = FirstStr.concat(Rest);
> -}").
> -:- pragma foreign_proc("Erlang",
> - first_char(Str::uo, First::in, Rest::in),
> - [will_not_call_mercury, promise_pure, thread_safe],
> -"
> - Str = unicode:characters_to_binary([First, Rest])
> -").
> +:- pragma promise_equivalent_clauses(first_char/3).
> +
> +first_char(Str::in, First::in, Rest::in) :-
> + first_char_rest_in(Str, First, Rest).
> +first_char(Str::in, First::uo, Rest::in) :-
> + first_char_rest_in(Str, First, Rest).
> +first_char(Str::in, First::in, Rest::uo) :-
> + first_char_rest_out(Str, First, Rest).
> +first_char(Str::in, First::uo, Rest::uo) :-
> + first_char_rest_out(Str, First, Rest).
> +first_char(Str::uo, First::in, Rest::in) :-
> + first_char_str_out(Str, First, Rest).
> +
> +:- pred first_char_rest_in(string, char, string).
> +:- mode first_char_rest_in(in, in, in) is semidet.
> +:- mode first_char_rest_in(in, uo, in) is semidet.
> +
> +first_char_rest_in(Str, First, Rest) :-
> + index_next_not_replaced(Str, 0, NextIndex, First0),
> + not is_surrogate(First0),
> + unsafe_promise_unique(First0, First),
> + unsafe_compare_substrings((=), Str, NextIndex, Rest, 0, length(Rest)).
> +
> +:- pred first_char_rest_out(string, char, string).
> +:- mode first_char_rest_out(in, in, uo) is semidet.
> +:- mode first_char_rest_out(in, uo, uo) is semidet.
> +
> +first_char_rest_out(Str, First, Rest) :-
> + index_next_not_replaced(Str, 0, NextIndex, First0),
> + not is_surrogate(First0),
> + unsafe_promise_unique(First0, First),
> + unsafe_between(Str, NextIndex, length(Str), Rest).
> +
> +:- pred first_char_str_out(string, char, string).
> +:- mode first_char_str_out(uo, in, in) is det.
> +
> +first_char_str_out(Str, First, Rest) :-
> + ( if char.to_int(First, 0) then
> + unexpected($pred, "null character")
> + else if char.is_surrogate(First) then
> + unexpected($pred, "surrogate code point")
> + else
> + Str = char_to_string(First) ++ Rest
> + ).
>
> %---------------------%
>
> diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
> index 47e8ec5a8..048827e4a 100644
> --- a/tests/hard_coded/Mmakefile
> +++ b/tests/hard_coded/Mmakefile
> @@ -364,6 +364,7 @@ ORDINARY_PROGS = \
> string_compare_substrings \
> string_count_codepoints_ilseq \
> string_first_char \
> + string_first_char_ilseq \
> string_fold_ilseq \
> string_from_char_list_ilseq \
> string_index_ilseq \
> diff --git a/tests/hard_coded/string_first_char_ilseq.exp
> b/tests/hard_coded/string_first_char_ilseq.exp
> new file mode 100644
> index 000000000..1e8d934f3
> --- /dev/null
> +++ b/tests/hard_coded/string_first_char_ilseq.exp
> @@ -0,0 +1,10 @@
> +first_char(in, out, in) failed
> +first_char(in, out, out) failed
> +first_char(in, in, in) failed
> +first_char(in, in, out) failed
> +first_char(in, in, in) failed
> +first_char(in, in, out) failed
> +first_char(in, in, in) failed
> +first_char(in, in, out) failed
> +first_char(out, in, in) threw exception: software_error("predicate
> `string.first_char_str_out\'/3: Unexpected: surrogate code point")
> +first_char(out, in, in) threw exception: software_error("predicate
> `string.first_char_str_out\'/3: Unexpected: surrogate code point")
> diff --git a/tests/hard_coded/string_first_char_ilseq.m
> b/tests/hard_coded/string_first_char_ilseq.m
> new file mode 100644
> index 000000000..107189580
> --- /dev/null
> +++ b/tests/hard_coded/string_first_char_ilseq.m
> @@ -0,0 +1,97 @@
>
> +%---------------------------------------------------------------------------%
> +% vim: ts=4 sw=4 et ft=mercury
>
> +%---------------------------------------------------------------------------%
> +
> +:- module string_first_char_ilseq.
> +:- interface.
> +
> +:- import_module io.
> +
> +:- pred main(io::di, io::uo) is cc_multi.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +:- implementation.
> +
> +:- import_module char.
> +:- import_module string.
> +
>
> +%---------------------------------------------------------------------------%
> +
> +main(!IO) :-
> + S0 = "😀", % UTF-16: 0xD83D 0xDE00
> + S1 = string.between(S0, 1, length(S0)), % UTF-16: 0xDE00
> + Rest = "rest",
> + S = S1 ++ Rest,
> +
> + test_first_char_ioi(S, Rest, !IO),
> + test_first_char_ioo(S, !IO),
> +
> + % An implementation might return U+FFFD as the first char of a string
> + % beginning with an ill-formed code unit sequence, which is wrong by
> our
> + % definition.
> + Replacement = char.det_from_int(0xFFFD),
> + test_first_char_iii(S, Replacement, Rest, !IO),
> + test_first_char_iio(S, Replacement, !IO),
> +
> + % An implementation might separate out a surrogate code point as the
> + % first code point, which is wrong by our definition.
> + HiSurr = char.det_from_int(0xD83D),
> + LoSurr = char.det_from_int(0xDE00),
> + test_first_char_iii(S, HiSurr, Rest, !IO),
> + test_first_char_iio(S, HiSurr, !IO),
> + test_first_char_iii(S, LoSurr, Rest, !IO),
> + test_first_char_iio(S, LoSurr, !IO),
> +
> + % Prepending a surrogate code point is disallowed.
> + test_first_char_oii(HiSurr, S, !IO),
> + test_first_char_oii(LoSurr, S, !IO).
> +
> +:- pred test_first_char_iii(string::in, char::in, string::in, io::di,
> io::uo)
> + is det.
> +
> +test_first_char_iii(Str, FirstChar, Rest, !IO) :-
> + ( if string.first_char(Str, FirstChar, Rest) then
> + io.write_string("first_char(in, in, in) succeeded\n", !IO)
> + else
> + io.write_string("first_char(in, in, in) failed\n", !IO)
> + ).
> +
> +:- pred test_first_char_ioi(string::in, string::in, io::di, io::uo) is
> det.
> +
> +test_first_char_ioi(Str, Rest, !IO) :-
> + ( if string.first_char(Str, _FirstChar, Rest) then
> + io.write_string("first_char(in, out, in) succeeded\n", !IO)
> + else
> + io.write_string("first_char(in, out, in) failed\n", !IO)
> + ).
> +
> +:- pred test_first_char_iio(string::in, char::in, io::di, io::uo) is det.
> +
> +test_first_char_iio(Str, FirstChar, !IO) :-
> + ( if string.first_char(Str, FirstChar, _Rest) then
> + io.write_string("first_char(in, in, out) succeeded\n", !IO)
> + else
> + io.write_string("first_char(in, in, out) failed\n", !IO)
> + ).
> +
> +:- pred test_first_char_ioo(string::in, io::di, io::uo) is det.
> +
> +test_first_char_ioo(Str, !IO) :-
> + ( if string.first_char(Str, _FirstChar, _Rest) then
> + io.write_string("first_char(in, out, out) succeeded\n", !IO)
> + else
> + io.write_string("first_char(in, out, out) failed\n", !IO)
> + ).
> +
> +:- pred test_first_char_oii(char::in, string::in, io::di, io::uo) is
> cc_multi.
> +
> +test_first_char_oii(FirstChar, Rest, !IO) :-
> + ( try []
> + string.first_char(_Str, FirstChar, Rest)
> + then
> + io.write_string("first_char(out, in, in) succeeded\n", !IO)
> + catch_any Excp ->
> + io.write_string("first_char(out, in, in) threw exception: ", !IO),
> + io.print_line(Excp, !IO)
> + ).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20191029/c9f820f1/attachment-0001.html>
More information about the reviews
mailing list