[m-rev.] for review: Define behaviour of string.index on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:17 AEDT 2019


library/string.m:
    Make string.index/3 and string.unsafe_index/3
    return either U+FFFD or an unpaired surrogate code point
    when an ill-formed code unit sequence is detected.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_index_ilseq.exp:
tests/hard_coded/string_index_ilseq.exp2:
tests/hard_coded/string_index_ilseq.m:
    Add test case.
---
 library/string.m                         | 70 ++++++++++--------------
 tests/hard_coded/Mmakefile               |  1 +
 tests/hard_coded/string_index_ilseq.exp  | 10 ++++
 tests/hard_coded/string_index_ilseq.exp2 |  6 ++
 tests/hard_coded/string_index_ilseq.m    | 42 ++++++++++++++
 5 files changed, 87 insertions(+), 42 deletions(-)
 create mode 100644 tests/hard_coded/string_index_ilseq.exp
 create mode 100644 tests/hard_coded/string_index_ilseq.exp2
 create mode 100644 tests/hard_coded/string_index_ilseq.m

diff --git a/library/string.m b/library/string.m
index bffebc68f..09716dc49 100644
--- a/library/string.m
+++ b/library/string.m
@@ -228,29 +228,32 @@
 
     % index(String, Index, Char):
     %
-    % `Char' is the character (code point) in `String', beginning at the
-    % code unit `Index'. Fails if `Index' is out of range (negative, or
-    % greater than or equal to the length of `String').
+    % If `Index' is the initial code unit offset of a well-formed code unit
+    % sequence in `String' then `Char' is the code point encoded by that
+    % sequence.
+    %
+    % If the code unit in `String' at `Index' is part of an ill-formed sequence
+    % then `Char' is either U+FFFD REPLACEMENT CHARACTER (if strings use UTF-8
+    % encoding) or the unpaired surrogate code point at `Index' (if strings use
+    % UTF-16 encoding).
     %
-    % Calls error/1 if an illegal sequence is detected.
+    % Fails if `Index' is out of range (negative, or greater than or equal to
+    % the length of `String').
     %
 :- pred index(string::in, int::in, char::uo) is semidet.
 
     % det_index(String, Index, Char):
     %
-    % `Char' is the character (code point) in `String', beginning at the
-    % code unit `Index'.
-    % Calls error/1 if `Index' is out of range (negative, or greater than
-    % or equal to the length of `String'), or if an illegal sequence is
-    % detected.
+    % Like index/3 but throws an exception if `Index' is out of range
+    % (negative, or greater than or equal to the length of `String').
     %
 :- func det_index(string, int) = char.
 :- pred det_index(string::in, int::in, char::uo) is det.
 
     % unsafe_index(String, Index, Char):
     %
-    % `Char' is the character (code point) in `String', beginning at the
-    % code unit `Index'.
+    % Like index/3 but does not check that `Index' is in range.
+    %
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than or equal to the length of `String').
     % This version is constant time, whereas det_index
@@ -2163,16 +2166,12 @@ duplicate_char(Char, Count, String) :-
 % so that the compiler can do loop invariant hoisting on calls to them
 % that occur in loops.
 
-% XXX ILSEQ index/3 and unsafe_index/3 throw an exception if an ill-formed
-% sequence is detected. The index_next family fails instead. Both families
-% should be made consistent.
+% XXX ILSEQ The index_next family fails if an ill-formed sequence is detected.
+% It needs to be updated to match the behaviour of the index family.
 %
 % We should allow the possibility of working with strings containing ill-formed
 % sequences. That would require predicates that can return either a code point
 % when possible, or else code units from ill-formed sequences.
-%
-% It might have been overzealous to throw an exception for unpaired surrogate
-% code points in UTF-16.
 
 :- pragma inline(index/3).
 :- pragma inline(det_index/3).
@@ -2200,17 +2199,8 @@ det_index(String, Int, Char) :-
 unsafe_index(S, N) = C :-
     unsafe_index(S, N, C).
 
-unsafe_index(Str, Index, Char) :-
-    ( if unsafe_index_2(Str, Index, CharPrime) then
-        Char = CharPrime
-    else
-        unexpected($pred, "illegal sequence")
-    ).
-
-:- pred unsafe_index_2(string::in, int::in, char::uo) is semidet.
-
 :- pragma foreign_proc("C",
-    unsafe_index_2(Str::in, Index::in, Ch::uo),
+    unsafe_index(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
@@ -2218,44 +2208,40 @@ unsafe_index(Str, Index, Char) :-
     if (!MR_is_ascii(Ch)) {
         int width;
         Ch = MR_utf8_get_mb(Str, Index, &width);
+        if (Ch < 0) {
+            Ch = 0xFFFD;
+        }
     }
-    SUCCESS_INDICATOR = (Ch > 0);
 ").
 :- pragma foreign_proc("C#",
-    unsafe_index_2(Str::in, Index::in, Ch::uo),
+    unsafe_index(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     char c1 = Str[Index];
+    Ch = c1;
     if (System.Char.IsSurrogate(c1)) {
         try {
             char c2 = Str[Index + 1];
             Ch = System.Char.ConvertToUtf32(c1, c2);
         } catch (System.ArgumentOutOfRangeException) {
-            Ch = -1;
+            // Return unpaired surrogate code point.
         } catch (System.IndexOutOfRangeException) {
-            Ch = -1;
+            // Return unpaired surrogate code point.
         }
-    } else {
-        // Common case.
-        Ch = c1;
     }
-    SUCCESS_INDICATOR = (Ch >= 0);
 ").
 :- pragma foreign_proc("Java",
-    unsafe_index_2(Str::in, Index::in, Ch::uo),
+    unsafe_index(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     Ch = Str.codePointAt(Index);
-    SUCCESS_INDICATOR =
-        !java.lang.Character.isHighSurrogate((char) Ch) &&
-        !java.lang.Character.isLowSurrogate((char) Ch);
 ").
 :- pragma foreign_proc("Erlang",
-    unsafe_index_2(Str::in, Index::in, Ch::uo),
+    unsafe_index(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    <<_:Index/binary, Ch/utf8, _/binary>> = Str,
-    SUCCESS_INDICATOR = true
+    % XXX does not handle ill-formed sequences as described
+    <<_:Index/binary, Ch/utf8, _/binary>> = Str
 ").
 
 %---------------------%
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 3ff70e0fb..631d0da19 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -358,6 +358,7 @@ ORDINARY_PROGS = \
 	string_code_unit \
 	string_codepoint \
 	string_first_char \
+	string_index_ilseq \
 	string_loop \
 	string_presuffix \
 	string_set_char \
diff --git a/tests/hard_coded/string_index_ilseq.exp b/tests/hard_coded/string_index_ilseq.exp
new file mode 100644
index 000000000..9b129bd17
--- /dev/null
+++ b/tests/hard_coded/string_index_ilseq.exp
@@ -0,0 +1,10 @@
+string.index(S, 0, 0x1f600)
+string.index(S, 1, 0xfffd)
+string.index(S, 2, 0xfffd)
+string.index(S, 3, 0xfffd)
+string.index(S, 4, 0xfffd)
+string.index(S, 5, 0x1f600)
+string.index(S, 6, 0xfffd)
+string.index(S, 7, 0xfffd)
+string.index(S, 8, 0xfffd)
+string.index(S, 9, _) failed
diff --git a/tests/hard_coded/string_index_ilseq.exp2 b/tests/hard_coded/string_index_ilseq.exp2
new file mode 100644
index 000000000..784cba43b
--- /dev/null
+++ b/tests/hard_coded/string_index_ilseq.exp2
@@ -0,0 +1,6 @@
+string.index(S, 0, 0x1f600)
+string.index(S, 1, 0xde00)
+string.index(S, 2, 0xd83d)
+string.index(S, 3, 0x1f600)
+string.index(S, 4, 0xde00)
+string.index(S, 5, _) failed
diff --git a/tests/hard_coded/string_index_ilseq.m b/tests/hard_coded/string_index_ilseq.m
new file mode 100644
index 000000000..143cd31ba
--- /dev/null
+++ b/tests/hard_coded/string_index_ilseq.m
@@ -0,0 +1,42 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_index_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, 1),
+    S = S0 ++ S1 ++ S0,
+    list.foldl(test_index(S), 0 .. count_code_units(S), !IO).
+
+:- pred test_index(string::in, int::in, io::di, io::uo) is det.
+
+test_index(S, Index, !IO) :-
+    ( if string.index(S, Index, Char) then
+        io.format("string.index(S, %d, %#x)\n",
+            [i(Index), i(char.to_int(Char))], !IO)
+    else
+        io.format("string.index(S, %d, _) failed\n",
+            [i(Index)], !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list