[m-rev.] for review: Define behaviour of string.first_char/3 on edge cases.

Peter Wang novalazy at gmail.com
Tue Oct 29 12:01:47 AEDT 2019


library/string.m:
    Define first_char/3 to fail if the input string begins with an
    ill-formed code unit sequence.

    Define the reverse mode to throw an exception on an attempt to
    encode a null character or surrogate code point in the output
    string.

    Reimplement first_char/3 in Mercury.

hard_coded/Mmakefile:
hard_coded/string_first_char_ilseq.exp:
hard_coded/string_first_char_ilseq.m:
    Add test case.
---
 library/string.m                             | 373 ++++---------------
 tests/hard_coded/Mmakefile                   |   1 +
 tests/hard_coded/string_first_char_ilseq.exp |  10 +
 tests/hard_coded/string_first_char_ilseq.m   |  97 +++++
 4 files changed, 183 insertions(+), 298 deletions(-)
 create mode 100644 tests/hard_coded/string_first_char_ilseq.exp
 create mode 100644 tests/hard_coded/string_first_char_ilseq.m

diff --git a/library/string.m b/library/string.m
index cbdbcd128..82bc84fa3 100644
--- a/library/string.m
+++ b/library/string.m
@@ -670,8 +670,13 @@
 % Splitting up strings.
 %
 
-    % first_char(String, Char, Rest) is true iff Char is the first character
-    % (code point) of String, and Rest is the remainder.
+    % first_char(String, Char, Rest) is true iff `String' begins with a
+    % well-formed code unit sequence, `Char' is the code point encoded by
+    % that sequence, and `Rest' is the rest of `String' after that sequence.
+    %
+    % The (uo, in, in) mode throws an exception if `Char' cannot be encoded in
+    % a string, or if `Char' is a surrogate code point (for consistency with
+    % the other modes).
     %
     % WARNING: first_char makes a copy of Rest because the garbage collector
     % doesn't handle references into the middle of an object, at least not the
@@ -2395,6 +2400,28 @@ index_next(Str, Index, NextIndex, Char) :-
     end
 ").
 
+%---------------------%
+
+    % XXX ILSEQ Provide public interfaces to index into strings while
+    % signalling if we encountered an ill-formed sequence.
+    %
+:- pred index_next_not_replaced(string::in, int::in, int::out, char::uo)
+    is semidet.
+
+index_next_not_replaced(Str, Index, NextIndex, Char) :-
+    index_next(Str, Index, NextIndex, Char0),
+    ( if
+        internal_encoding_is_utf8,
+        Char0 = '\ufffd'
+    then
+        unsafe_between(Str, Index, NextIndex, "\ufffd")
+    else
+        true
+    ),
+    unsafe_promise_unique(Char0, Char).
+
+%---------------------%
+
 prev_index(Str, Index, PrevIndex, Char) :-
     Len = length(Str),
     ( if index_check(Index - 1, Len) then
@@ -2507,6 +2534,8 @@ do_unsafe_prev_index(Str, Index) ->
     end.
 ").
 
+%---------------------%
+
     % XXX We should consider making this routine a compiler built-in.
     %
 :- pred index_check(int::in, int::in) is semidet.
@@ -3896,302 +3925,50 @@ join_list_loop(Sep, [H | T]) = Sep ++ H ++ join_list_loop(Sep, T).
 % Splitting up strings.
 %
 
-% XXX ILSEQ Behaviour depends on target language.
-%  - C: fails if the string begins with ill-formed sequence
-%  - Java/C#: succeeds if the string begins with an unpaired surrogate
-
-:- pragma foreign_proc("C",
-    first_char(Str::in, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"
-    MR_Integer pos = 0;
-    int c = MR_utf8_get_next(Str, &pos);
-    SUCCESS_INDICATOR = (
-        c == First &&
-        First != '\\0' &&
-        strcmp(Str + pos, Rest) == 0
-    );
-").
-:- pragma foreign_proc("C#",
-    first_char(Str::in, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    int len = Str.Length;
-    if (First <= 0xffff) {
-        SUCCESS_INDICATOR = (
-            len > 0 &&
-            Str[0] == First &&
-            System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
-        );
-    } else {
-        string firstchars = System.Char.ConvertFromUtf32(First);
-        SUCCESS_INDICATOR = (
-            len > 1 &&
-            Str[0] == firstchars[0] &&
-            Str[1] == firstchars[1] &&
-            System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
-        );
-    }
-").
-:- pragma foreign_proc("Java",
-    first_char(Str::in, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    int toffset = java.lang.Character.charCount(First);
-    SUCCESS_INDICATOR = (
-        Str.length() > 0 &&
-        Str.codePointAt(0) == First &&
-        Str.regionMatches(toffset, Rest, 0, Rest.length())
-    );
-").
-:- pragma foreign_proc("Erlang",
-    first_char(Str::in, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    case Str of
-        <<First/utf8, Rest/binary>> ->
-            SUCCESS_INDICATOR = true;
-        _ ->
-            SUCCESS_INDICATOR = false
-    end
-").
-
-:- pragma foreign_proc("C",
-    first_char(Str::in, First::uo, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"
-    MR_Integer pos = 0;
-    First = MR_utf8_get_next(Str, &pos);
-    SUCCESS_INDICATOR = (First > 0 && strcmp(Str + pos, Rest) == 0);
-").
-:- pragma foreign_proc("C#",
-    first_char(Str::in, First::uo, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    try {
-        int len = Str.Length;
-        char c1 = Str[0];
-        if (System.Char.IsHighSurrogate(c1)) {
-            char c2 = Str[1];
-            First = System.Char.ConvertToUtf32(c1, c2);
-            SUCCESS_INDICATOR =
-                (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
-        } else {
-            First = c1;
-            SUCCESS_INDICATOR =
-                (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
-        }
-    } catch (System.IndexOutOfRangeException) {
-        SUCCESS_INDICATOR = false;
-        First = (char) 0;
-    } catch (System.ArgumentOutOfRangeException) {
-        SUCCESS_INDICATOR = false;
-        First = (char) 0;
-    }
-").
-:- pragma foreign_proc("Java",
-    first_char(Str::in, First::uo, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    int toffset;
-    if (Str.length() > 0) {
-        First = Str.codePointAt(0);
-        toffset = java.lang.Character.charCount(First);
-        SUCCESS_INDICATOR =
-            Str.regionMatches(toffset, Rest, 0, Rest.length());
-    } else {
-        SUCCESS_INDICATOR = false;
-        // XXX to avoid uninitialized var warning
-        First = 0;
-    }
-").
-:- pragma foreign_proc("Erlang",
-    first_char(Str::in, First::uo, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    case Str of
-        <<First/utf8, Rest/binary>> ->
-            SUCCESS_INDICATOR = true;
-        _ ->
-            SUCCESS_INDICATOR = false,
-            First = 0
-    end
-").
-
-:- pragma foreign_proc("C",
-    first_char(Str::in, First::in, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"{
-    MR_Integer pos = 0;
-    int c = MR_utf8_get_next(Str, &pos);
-    if (c != First || First == '\\0') {
-        SUCCESS_INDICATOR = MR_FALSE;
-    } else {
-        Str += pos;
-        // We need to make a copy to ensure that the pointer is word-aligned.
-        MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
-        strcpy(Rest, Str);
-        SUCCESS_INDICATOR = MR_TRUE;
-    }
-}").
-:- pragma foreign_proc("C#",
-    first_char(Str::in, First::in, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    int len = Str.Length;
-
-    if (len > 0) {
-        if (First <= 0xffff) {
-            SUCCESS_INDICATOR = (First == Str[0]);
-            Rest = Str.Substring(1);
-        } else {
-            string firststr = System.Char.ConvertFromUtf32(First);
-            SUCCESS_INDICATOR =
-                (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) == 0);
-            Rest = Str.Substring(2);
-        }
-    } else {
-        SUCCESS_INDICATOR = false;
-        Rest = null;
-    }
-}").
-:- pragma foreign_proc("Java",
-    first_char(Str::in, First::in, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    int len = Str.length();
-
-    if (len > 0) {
-        SUCCESS_INDICATOR = (First == Str.codePointAt(0));
-        Rest = Str.substring(java.lang.Character.charCount(First));
-    } else {
-        SUCCESS_INDICATOR = false;
-        // XXX to avoid uninitialized var warning
-        Rest = null;
-    }
-}").
-:- pragma foreign_proc("Erlang",
-    first_char(Str::in, First::in, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    case Str of
-        <<First/utf8, Rest/binary>> ->
-            SUCCESS_INDICATOR = true;
-        _ ->
-            SUCCESS_INDICATOR = false,
-            Rest = <<>>
-    end
-").
-
-:- pragma foreign_proc("C",
-    first_char(Str::in, First::uo, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"{
-    MR_Integer pos = 0;
-    First = MR_utf8_get_next(Str, &pos);
-    if (First < 1) {
-        SUCCESS_INDICATOR = MR_FALSE;
-    } else {
-        Str += pos;
-        // We need to make a copy to ensure that the pointer is word-aligned.
-        MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
-        strcpy(Rest, Str);
-        SUCCESS_INDICATOR = MR_TRUE;
-    }
-}").
-:- pragma foreign_proc("C#",
-    first_char(Str::in, First::uo, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    try {
-        char c1 = Str[0];
-        if (System.Char.IsHighSurrogate(c1)) {
-            char c2 = Str[1];
-            First = System.Char.ConvertToUtf32(c1, c2);
-            Rest = Str.Substring(2);
-        } else {
-            First = Str[0];
-            Rest = Str.Substring(1);
-        }
-        SUCCESS_INDICATOR = true;
-    } catch (System.IndexOutOfRangeException) {
-        SUCCESS_INDICATOR = false;
-        First = (char) 0;
-        Rest = null;
-    } catch (System.ArgumentOutOfRangeException) {
-        SUCCESS_INDICATOR = false;
-        First = (char) 0;
-        Rest = null;
-    }
-}").
-:- pragma foreign_proc("Java",
-    first_char(Str::in, First::uo, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    if (Str.length() == 0) {
-        SUCCESS_INDICATOR = false;
-        First = (char) 0;
-        Rest = null;
-    } else {
-        First = Str.codePointAt(0);
-        Rest = Str.substring(java.lang.Character.charCount(First));
-        SUCCESS_INDICATOR = true;
-    }
-}").
-:- pragma foreign_proc("Erlang",
-    first_char(Str::in, First::uo, Rest::uo),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    case Str of
-        <<First/utf8, Rest/binary>> ->
-            SUCCESS_INDICATOR = true;
-        _ ->
-            SUCCESS_INDICATOR = false,
-            First = 0,
-            Rest = <<>>
-    end
-").
-
-:- pragma foreign_proc("C",
-    first_char(Str::uo, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"{
-    size_t firstw = MR_utf8_width(First);
-    size_t len = firstw + strlen(Rest);
-    MR_allocate_aligned_string_msg(Str, len, MR_ALLOC_ID);
-    MR_utf8_encode(Str, First);
-    strcpy(Str + firstw, Rest);
-}").
-:- pragma foreign_proc("C#",
-    first_char(Str::uo, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    string FirstStr;
-    if (First <= 0xffff) {
-        FirstStr = new System.String((char) First, 1);
-    } else {
-        FirstStr = System.Char.ConvertFromUtf32(First);
-    }
-    Str = FirstStr + Rest;
-}").
-:- pragma foreign_proc("Java",
-    first_char(Str::uo, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"{
-    String FirstStr = new String(Character.toChars(First));
-    Str = FirstStr.concat(Rest);
-}").
-:- pragma foreign_proc("Erlang",
-    first_char(Str::uo, First::in, Rest::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    Str = unicode:characters_to_binary([First, Rest])
-").
+:- pragma promise_equivalent_clauses(first_char/3).
+
+first_char(Str::in, First::in, Rest::in) :-
+    first_char_rest_in(Str, First, Rest).
+first_char(Str::in, First::uo, Rest::in) :-
+    first_char_rest_in(Str, First, Rest).
+first_char(Str::in, First::in, Rest::uo) :-
+    first_char_rest_out(Str, First, Rest).
+first_char(Str::in, First::uo, Rest::uo) :-
+    first_char_rest_out(Str, First, Rest).
+first_char(Str::uo, First::in, Rest::in) :-
+    first_char_str_out(Str, First, Rest).
+
+:- pred first_char_rest_in(string, char, string).
+:- mode first_char_rest_in(in, in, in) is semidet.
+:- mode first_char_rest_in(in, uo, in) is semidet.
+
+first_char_rest_in(Str, First, Rest) :-
+    index_next_not_replaced(Str, 0, NextIndex, First0),
+    not is_surrogate(First0),
+    unsafe_promise_unique(First0, First),
+    unsafe_compare_substrings((=), Str, NextIndex, Rest, 0, length(Rest)).
+
+:- pred first_char_rest_out(string, char, string).
+:- mode first_char_rest_out(in, in, uo) is semidet.
+:- mode first_char_rest_out(in, uo, uo) is semidet.
+
+first_char_rest_out(Str, First, Rest) :-
+    index_next_not_replaced(Str, 0, NextIndex, First0),
+    not is_surrogate(First0),
+    unsafe_promise_unique(First0, First),
+    unsafe_between(Str, NextIndex, length(Str), Rest).
+
+:- pred first_char_str_out(string, char, string).
+:- mode first_char_str_out(uo, in, in) is det.
+
+first_char_str_out(Str, First, Rest) :-
+    ( if char.to_int(First, 0) then
+        unexpected($pred, "null character")
+    else if char.is_surrogate(First) then
+        unexpected($pred, "surrogate code point")
+    else
+        Str = char_to_string(First) ++ Rest
+    ).
 
 %---------------------%
 
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 47e8ec5a8..048827e4a 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -364,6 +364,7 @@ ORDINARY_PROGS = \
 	string_compare_substrings \
 	string_count_codepoints_ilseq \
 	string_first_char \
+	string_first_char_ilseq \
 	string_fold_ilseq \
 	string_from_char_list_ilseq \
 	string_index_ilseq \
diff --git a/tests/hard_coded/string_first_char_ilseq.exp b/tests/hard_coded/string_first_char_ilseq.exp
new file mode 100644
index 000000000..1e8d934f3
--- /dev/null
+++ b/tests/hard_coded/string_first_char_ilseq.exp
@@ -0,0 +1,10 @@
+first_char(in, out, in) failed
+first_char(in, out, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(out, in, in) threw exception: software_error("predicate `string.first_char_str_out\'/3: Unexpected: surrogate code point")
+first_char(out, in, in) threw exception: software_error("predicate `string.first_char_str_out\'/3: Unexpected: surrogate code point")
diff --git a/tests/hard_coded/string_first_char_ilseq.m b/tests/hard_coded/string_first_char_ilseq.m
new file mode 100644
index 000000000..107189580
--- /dev/null
+++ b/tests/hard_coded/string_first_char_ilseq.m
@@ -0,0 +1,97 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+
+:- module string_first_char_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is cc_multi.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",                              % UTF-16: 0xD83D 0xDE00
+    S1 = string.between(S0, 1, length(S0)), % UTF-16: 0xDE00
+    Rest = "rest",
+    S = S1 ++ Rest,
+
+    test_first_char_ioi(S, Rest, !IO),
+    test_first_char_ioo(S, !IO),
+
+    % An implementation might return U+FFFD as the first char of a string
+    % beginning with an ill-formed code unit sequence, which is wrong by our
+    % definition.
+    Replacement = char.det_from_int(0xFFFD),
+    test_first_char_iii(S, Replacement, Rest, !IO),
+    test_first_char_iio(S, Replacement, !IO),
+
+    % An implementation might separate out a surrogate code point as the
+    % first code point, which is wrong by our definition.
+    HiSurr = char.det_from_int(0xD83D),
+    LoSurr = char.det_from_int(0xDE00),
+    test_first_char_iii(S, HiSurr, Rest, !IO),
+    test_first_char_iio(S, HiSurr, !IO),
+    test_first_char_iii(S, LoSurr, Rest, !IO),
+    test_first_char_iio(S, LoSurr, !IO),
+
+    % Prepending a surrogate code point is disallowed.
+    test_first_char_oii(HiSurr, S, !IO),
+    test_first_char_oii(LoSurr, S, !IO).
+
+:- pred test_first_char_iii(string::in, char::in, string::in, io::di, io::uo)
+    is det.
+
+test_first_char_iii(Str, FirstChar, Rest, !IO) :-
+    ( if string.first_char(Str, FirstChar, Rest) then
+        io.write_string("first_char(in, in, in) succeeded\n", !IO)
+    else
+        io.write_string("first_char(in, in, in) failed\n", !IO)
+    ).
+
+:- pred test_first_char_ioi(string::in, string::in, io::di, io::uo) is det.
+
+test_first_char_ioi(Str, Rest, !IO) :-
+    ( if string.first_char(Str, _FirstChar, Rest) then
+        io.write_string("first_char(in, out, in) succeeded\n", !IO)
+    else
+        io.write_string("first_char(in, out, in) failed\n", !IO)
+    ).
+
+:- pred test_first_char_iio(string::in, char::in, io::di, io::uo) is det.
+
+test_first_char_iio(Str, FirstChar, !IO) :-
+    ( if string.first_char(Str, FirstChar, _Rest) then
+        io.write_string("first_char(in, in, out) succeeded\n", !IO)
+    else
+        io.write_string("first_char(in, in, out) failed\n", !IO)
+    ).
+
+:- pred test_first_char_ioo(string::in, io::di, io::uo) is det.
+
+test_first_char_ioo(Str, !IO) :-
+    ( if string.first_char(Str, _FirstChar, _Rest) then
+        io.write_string("first_char(in, out, out) succeeded\n", !IO)
+    else
+        io.write_string("first_char(in, out, out) failed\n", !IO)
+    ).
+
+:- pred test_first_char_oii(char::in, string::in, io::di, io::uo) is cc_multi.
+
+test_first_char_oii(FirstChar, Rest, !IO) :-
+    ( try []
+        string.first_char(_Str, FirstChar, Rest)
+    then
+        io.write_string("first_char(out, in, in) succeeded\n", !IO)
+    catch_any Excp ->
+        io.write_string("first_char(out, in, in) threw exception: ", !IO),
+        io.print_line(Excp, !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list