[m-rev.] for review: Define behaviour of string.first_char/3 on edge cases.
Peter Wang
novalazy at gmail.com
Tue Oct 29 12:01:47 AEDT 2019
library/string.m:
Define first_char/3 to fail if the input string begins with an
ill-formed code unit sequence.
Define the reverse mode to throw an exception on an attempt to
encode a null character or surrogate code point in the output
string.
Reimplement first_char/3 in Mercury.
hard_coded/Mmakefile:
hard_coded/string_first_char_ilseq.exp:
hard_coded/string_first_char_ilseq.m:
Add test case.
---
library/string.m | 373 ++++---------------
tests/hard_coded/Mmakefile | 1 +
tests/hard_coded/string_first_char_ilseq.exp | 10 +
tests/hard_coded/string_first_char_ilseq.m | 97 +++++
4 files changed, 183 insertions(+), 298 deletions(-)
create mode 100644 tests/hard_coded/string_first_char_ilseq.exp
create mode 100644 tests/hard_coded/string_first_char_ilseq.m
diff --git a/library/string.m b/library/string.m
index cbdbcd128..82bc84fa3 100644
--- a/library/string.m
+++ b/library/string.m
@@ -670,8 +670,13 @@
% Splitting up strings.
%
- % first_char(String, Char, Rest) is true iff Char is the first character
- % (code point) of String, and Rest is the remainder.
+ % first_char(String, Char, Rest) is true iff `String' begins with a
+ % well-formed code unit sequence, `Char' is the code point encoded by
+ % that sequence, and `Rest' is the rest of `String' after that sequence.
+ %
+ % The (uo, in, in) mode throws an exception if `Char' cannot be encoded in
+ % a string, or if `Char' is a surrogate code point (for consistency with
+ % the other modes).
%
% WARNING: first_char makes a copy of Rest because the garbage collector
% doesn't handle references into the middle of an object, at least not the
@@ -2395,6 +2400,28 @@ index_next(Str, Index, NextIndex, Char) :-
end
").
+%---------------------%
+
+ % XXX ILSEQ Provide public interfaces to index into strings while
+ % signalling if we encountered an ill-formed sequence.
+ %
+:- pred index_next_not_replaced(string::in, int::in, int::out, char::uo)
+ is semidet.
+
+index_next_not_replaced(Str, Index, NextIndex, Char) :-
+ index_next(Str, Index, NextIndex, Char0),
+ ( if
+ internal_encoding_is_utf8,
+ Char0 = '\ufffd'
+ then
+ unsafe_between(Str, Index, NextIndex, "\ufffd")
+ else
+ true
+ ),
+ unsafe_promise_unique(Char0, Char).
+
+%---------------------%
+
prev_index(Str, Index, PrevIndex, Char) :-
Len = length(Str),
( if index_check(Index - 1, Len) then
@@ -2507,6 +2534,8 @@ do_unsafe_prev_index(Str, Index) ->
end.
").
+%---------------------%
+
% XXX We should consider making this routine a compiler built-in.
%
:- pred index_check(int::in, int::in) is semidet.
@@ -3896,302 +3925,50 @@ join_list_loop(Sep, [H | T]) = Sep ++ H ++ join_list_loop(Sep, T).
% Splitting up strings.
%
-% XXX ILSEQ Behaviour depends on target language.
-% - C: fails if the string begins with ill-formed sequence
-% - Java/C#: succeeds if the string begins with an unpaired surrogate
-
-:- pragma foreign_proc("C",
- first_char(Str::in, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"
- MR_Integer pos = 0;
- int c = MR_utf8_get_next(Str, &pos);
- SUCCESS_INDICATOR = (
- c == First &&
- First != '\\0' &&
- strcmp(Str + pos, Rest) == 0
- );
-").
-:- pragma foreign_proc("C#",
- first_char(Str::in, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- int len = Str.Length;
- if (First <= 0xffff) {
- SUCCESS_INDICATOR = (
- len > 0 &&
- Str[0] == First &&
- System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
- );
- } else {
- string firstchars = System.Char.ConvertFromUtf32(First);
- SUCCESS_INDICATOR = (
- len > 1 &&
- Str[0] == firstchars[0] &&
- Str[1] == firstchars[1] &&
- System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
- );
- }
-").
-:- pragma foreign_proc("Java",
- first_char(Str::in, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- int toffset = java.lang.Character.charCount(First);
- SUCCESS_INDICATOR = (
- Str.length() > 0 &&
- Str.codePointAt(0) == First &&
- Str.regionMatches(toffset, Rest, 0, Rest.length())
- );
-").
-:- pragma foreign_proc("Erlang",
- first_char(Str::in, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- case Str of
- <<First/utf8, Rest/binary>> ->
- SUCCESS_INDICATOR = true;
- _ ->
- SUCCESS_INDICATOR = false
- end
-").
-
-:- pragma foreign_proc("C",
- first_char(Str::in, First::uo, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"
- MR_Integer pos = 0;
- First = MR_utf8_get_next(Str, &pos);
- SUCCESS_INDICATOR = (First > 0 && strcmp(Str + pos, Rest) == 0);
-").
-:- pragma foreign_proc("C#",
- first_char(Str::in, First::uo, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- try {
- int len = Str.Length;
- char c1 = Str[0];
- if (System.Char.IsHighSurrogate(c1)) {
- char c2 = Str[1];
- First = System.Char.ConvertToUtf32(c1, c2);
- SUCCESS_INDICATOR =
- (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
- } else {
- First = c1;
- SUCCESS_INDICATOR =
- (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
- }
- } catch (System.IndexOutOfRangeException) {
- SUCCESS_INDICATOR = false;
- First = (char) 0;
- } catch (System.ArgumentOutOfRangeException) {
- SUCCESS_INDICATOR = false;
- First = (char) 0;
- }
-").
-:- pragma foreign_proc("Java",
- first_char(Str::in, First::uo, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- int toffset;
- if (Str.length() > 0) {
- First = Str.codePointAt(0);
- toffset = java.lang.Character.charCount(First);
- SUCCESS_INDICATOR =
- Str.regionMatches(toffset, Rest, 0, Rest.length());
- } else {
- SUCCESS_INDICATOR = false;
- // XXX to avoid uninitialized var warning
- First = 0;
- }
-").
-:- pragma foreign_proc("Erlang",
- first_char(Str::in, First::uo, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- case Str of
- <<First/utf8, Rest/binary>> ->
- SUCCESS_INDICATOR = true;
- _ ->
- SUCCESS_INDICATOR = false,
- First = 0
- end
-").
-
-:- pragma foreign_proc("C",
- first_char(Str::in, First::in, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"{
- MR_Integer pos = 0;
- int c = MR_utf8_get_next(Str, &pos);
- if (c != First || First == '\\0') {
- SUCCESS_INDICATOR = MR_FALSE;
- } else {
- Str += pos;
- // We need to make a copy to ensure that the pointer is word-aligned.
- MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
- strcpy(Rest, Str);
- SUCCESS_INDICATOR = MR_TRUE;
- }
-}").
-:- pragma foreign_proc("C#",
- first_char(Str::in, First::in, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- int len = Str.Length;
-
- if (len > 0) {
- if (First <= 0xffff) {
- SUCCESS_INDICATOR = (First == Str[0]);
- Rest = Str.Substring(1);
- } else {
- string firststr = System.Char.ConvertFromUtf32(First);
- SUCCESS_INDICATOR =
- (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) == 0);
- Rest = Str.Substring(2);
- }
- } else {
- SUCCESS_INDICATOR = false;
- Rest = null;
- }
-}").
-:- pragma foreign_proc("Java",
- first_char(Str::in, First::in, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- int len = Str.length();
-
- if (len > 0) {
- SUCCESS_INDICATOR = (First == Str.codePointAt(0));
- Rest = Str.substring(java.lang.Character.charCount(First));
- } else {
- SUCCESS_INDICATOR = false;
- // XXX to avoid uninitialized var warning
- Rest = null;
- }
-}").
-:- pragma foreign_proc("Erlang",
- first_char(Str::in, First::in, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- case Str of
- <<First/utf8, Rest/binary>> ->
- SUCCESS_INDICATOR = true;
- _ ->
- SUCCESS_INDICATOR = false,
- Rest = <<>>
- end
-").
-
-:- pragma foreign_proc("C",
- first_char(Str::in, First::uo, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"{
- MR_Integer pos = 0;
- First = MR_utf8_get_next(Str, &pos);
- if (First < 1) {
- SUCCESS_INDICATOR = MR_FALSE;
- } else {
- Str += pos;
- // We need to make a copy to ensure that the pointer is word-aligned.
- MR_allocate_aligned_string_msg(Rest, strlen(Str), MR_ALLOC_ID);
- strcpy(Rest, Str);
- SUCCESS_INDICATOR = MR_TRUE;
- }
-}").
-:- pragma foreign_proc("C#",
- first_char(Str::in, First::uo, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- try {
- char c1 = Str[0];
- if (System.Char.IsHighSurrogate(c1)) {
- char c2 = Str[1];
- First = System.Char.ConvertToUtf32(c1, c2);
- Rest = Str.Substring(2);
- } else {
- First = Str[0];
- Rest = Str.Substring(1);
- }
- SUCCESS_INDICATOR = true;
- } catch (System.IndexOutOfRangeException) {
- SUCCESS_INDICATOR = false;
- First = (char) 0;
- Rest = null;
- } catch (System.ArgumentOutOfRangeException) {
- SUCCESS_INDICATOR = false;
- First = (char) 0;
- Rest = null;
- }
-}").
-:- pragma foreign_proc("Java",
- first_char(Str::in, First::uo, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- if (Str.length() == 0) {
- SUCCESS_INDICATOR = false;
- First = (char) 0;
- Rest = null;
- } else {
- First = Str.codePointAt(0);
- Rest = Str.substring(java.lang.Character.charCount(First));
- SUCCESS_INDICATOR = true;
- }
-}").
-:- pragma foreign_proc("Erlang",
- first_char(Str::in, First::uo, Rest::uo),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- case Str of
- <<First/utf8, Rest/binary>> ->
- SUCCESS_INDICATOR = true;
- _ ->
- SUCCESS_INDICATOR = false,
- First = 0,
- Rest = <<>>
- end
-").
-
-:- pragma foreign_proc("C",
- first_char(Str::uo, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"{
- size_t firstw = MR_utf8_width(First);
- size_t len = firstw + strlen(Rest);
- MR_allocate_aligned_string_msg(Str, len, MR_ALLOC_ID);
- MR_utf8_encode(Str, First);
- strcpy(Str + firstw, Rest);
-}").
-:- pragma foreign_proc("C#",
- first_char(Str::uo, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- string FirstStr;
- if (First <= 0xffff) {
- FirstStr = new System.String((char) First, 1);
- } else {
- FirstStr = System.Char.ConvertFromUtf32(First);
- }
- Str = FirstStr + Rest;
-}").
-:- pragma foreign_proc("Java",
- first_char(Str::uo, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"{
- String FirstStr = new String(Character.toChars(First));
- Str = FirstStr.concat(Rest);
-}").
-:- pragma foreign_proc("Erlang",
- first_char(Str::uo, First::in, Rest::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- Str = unicode:characters_to_binary([First, Rest])
-").
+:- pragma promise_equivalent_clauses(first_char/3).
+
+first_char(Str::in, First::in, Rest::in) :-
+ first_char_rest_in(Str, First, Rest).
+first_char(Str::in, First::uo, Rest::in) :-
+ first_char_rest_in(Str, First, Rest).
+first_char(Str::in, First::in, Rest::uo) :-
+ first_char_rest_out(Str, First, Rest).
+first_char(Str::in, First::uo, Rest::uo) :-
+ first_char_rest_out(Str, First, Rest).
+first_char(Str::uo, First::in, Rest::in) :-
+ first_char_str_out(Str, First, Rest).
+
+:- pred first_char_rest_in(string, char, string).
+:- mode first_char_rest_in(in, in, in) is semidet.
+:- mode first_char_rest_in(in, uo, in) is semidet.
+
+first_char_rest_in(Str, First, Rest) :-
+ index_next_not_replaced(Str, 0, NextIndex, First0),
+ not is_surrogate(First0),
+ unsafe_promise_unique(First0, First),
+ unsafe_compare_substrings((=), Str, NextIndex, Rest, 0, length(Rest)).
+
+:- pred first_char_rest_out(string, char, string).
+:- mode first_char_rest_out(in, in, uo) is semidet.
+:- mode first_char_rest_out(in, uo, uo) is semidet.
+
+first_char_rest_out(Str, First, Rest) :-
+ index_next_not_replaced(Str, 0, NextIndex, First0),
+ not is_surrogate(First0),
+ unsafe_promise_unique(First0, First),
+ unsafe_between(Str, NextIndex, length(Str), Rest).
+
+:- pred first_char_str_out(string, char, string).
+:- mode first_char_str_out(uo, in, in) is det.
+
+first_char_str_out(Str, First, Rest) :-
+ ( if char.to_int(First, 0) then
+ unexpected($pred, "null character")
+ else if char.is_surrogate(First) then
+ unexpected($pred, "surrogate code point")
+ else
+ Str = char_to_string(First) ++ Rest
+ ).
%---------------------%
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 47e8ec5a8..048827e4a 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -364,6 +364,7 @@ ORDINARY_PROGS = \
string_compare_substrings \
string_count_codepoints_ilseq \
string_first_char \
+ string_first_char_ilseq \
string_fold_ilseq \
string_from_char_list_ilseq \
string_index_ilseq \
diff --git a/tests/hard_coded/string_first_char_ilseq.exp b/tests/hard_coded/string_first_char_ilseq.exp
new file mode 100644
index 000000000..1e8d934f3
--- /dev/null
+++ b/tests/hard_coded/string_first_char_ilseq.exp
@@ -0,0 +1,10 @@
+first_char(in, out, in) failed
+first_char(in, out, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(in, in, in) failed
+first_char(in, in, out) failed
+first_char(out, in, in) threw exception: software_error("predicate `string.first_char_str_out\'/3: Unexpected: surrogate code point")
+first_char(out, in, in) threw exception: software_error("predicate `string.first_char_str_out\'/3: Unexpected: surrogate code point")
diff --git a/tests/hard_coded/string_first_char_ilseq.m b/tests/hard_coded/string_first_char_ilseq.m
new file mode 100644
index 000000000..107189580
--- /dev/null
+++ b/tests/hard_coded/string_first_char_ilseq.m
@@ -0,0 +1,97 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+
+:- module string_first_char_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is cc_multi.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ S0 = "😀", % UTF-16: 0xD83D 0xDE00
+ S1 = string.between(S0, 1, length(S0)), % UTF-16: 0xDE00
+ Rest = "rest",
+ S = S1 ++ Rest,
+
+ test_first_char_ioi(S, Rest, !IO),
+ test_first_char_ioo(S, !IO),
+
+ % An implementation might return U+FFFD as the first char of a string
+ % beginning with an ill-formed code unit sequence, which is wrong by our
+ % definition.
+ Replacement = char.det_from_int(0xFFFD),
+ test_first_char_iii(S, Replacement, Rest, !IO),
+ test_first_char_iio(S, Replacement, !IO),
+
+ % An implementation might separate out a surrogate code point as the
+ % first code point, which is wrong by our definition.
+ HiSurr = char.det_from_int(0xD83D),
+ LoSurr = char.det_from_int(0xDE00),
+ test_first_char_iii(S, HiSurr, Rest, !IO),
+ test_first_char_iio(S, HiSurr, !IO),
+ test_first_char_iii(S, LoSurr, Rest, !IO),
+ test_first_char_iio(S, LoSurr, !IO),
+
+ % Prepending a surrogate code point is disallowed.
+ test_first_char_oii(HiSurr, S, !IO),
+ test_first_char_oii(LoSurr, S, !IO).
+
+:- pred test_first_char_iii(string::in, char::in, string::in, io::di, io::uo)
+ is det.
+
+test_first_char_iii(Str, FirstChar, Rest, !IO) :-
+ ( if string.first_char(Str, FirstChar, Rest) then
+ io.write_string("first_char(in, in, in) succeeded\n", !IO)
+ else
+ io.write_string("first_char(in, in, in) failed\n", !IO)
+ ).
+
+:- pred test_first_char_ioi(string::in, string::in, io::di, io::uo) is det.
+
+test_first_char_ioi(Str, Rest, !IO) :-
+ ( if string.first_char(Str, _FirstChar, Rest) then
+ io.write_string("first_char(in, out, in) succeeded\n", !IO)
+ else
+ io.write_string("first_char(in, out, in) failed\n", !IO)
+ ).
+
+:- pred test_first_char_iio(string::in, char::in, io::di, io::uo) is det.
+
+test_first_char_iio(Str, FirstChar, !IO) :-
+ ( if string.first_char(Str, FirstChar, _Rest) then
+ io.write_string("first_char(in, in, out) succeeded\n", !IO)
+ else
+ io.write_string("first_char(in, in, out) failed\n", !IO)
+ ).
+
+:- pred test_first_char_ioo(string::in, io::di, io::uo) is det.
+
+test_first_char_ioo(Str, !IO) :-
+ ( if string.first_char(Str, _FirstChar, _Rest) then
+ io.write_string("first_char(in, out, out) succeeded\n", !IO)
+ else
+ io.write_string("first_char(in, out, out) failed\n", !IO)
+ ).
+
+:- pred test_first_char_oii(char::in, string::in, io::di, io::uo) is cc_multi.
+
+test_first_char_oii(FirstChar, Rest, !IO) :-
+ ( try []
+ string.first_char(_Str, FirstChar, Rest)
+ then
+ io.write_string("first_char(out, in, in) succeeded\n", !IO)
+ catch_any Excp ->
+ io.write_string("first_char(out, in, in) threw exception: ", !IO),
+ io.print_line(Excp, !IO)
+ ).
--
2.23.0
More information about the reviews
mailing list