[m-rev.] for review: Define behaviour of string.index_next on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:18 AEDT 2019


library/string.m:
    Make string.index_next and string.unsafe_index_next
    return either U+FFFD or an unpaired surrogate code point
    when an ill-formed code unit sequence is detected.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_index_next_ilseq.exp:
tests/hard_coded/string_index_next_ilseq.exp2:
tests/hard_coded/string_index_next_ilseq.m:
    Add test case.
---
 library/string.m                              | 69 ++++++++++---------
 tests/hard_coded/Mmakefile                    |  1 +
 tests/hard_coded/string_index_next_ilseq.exp  |  5 ++
 tests/hard_coded/string_index_next_ilseq.exp2 |  5 ++
 tests/hard_coded/string_index_next_ilseq.m    | 44 ++++++++++++
 5 files changed, 90 insertions(+), 34 deletions(-)
 create mode 100644 tests/hard_coded/string_index_next_ilseq.exp
 create mode 100644 tests/hard_coded/string_index_next_ilseq.exp2
 create mode 100644 tests/hard_coded/string_index_next_ilseq.m

diff --git a/library/string.m b/library/string.m
index 09716dc49..b687cece3 100644
--- a/library/string.m
+++ b/library/string.m
@@ -274,17 +274,26 @@
 
     % index_next(String, Index, NextIndex, Char):
     %
-    % Like `index'/3 but also returns the position of the code unit
-    % that follows the code point beginning at `Index',
-    % i.e. NextIndex = Index + num_code_units_to_encode(Char).
+    % If `Index' is the initial code unit offset of a well-formed code unit
+    % sequence in `String' then `Char' is the code point encoded by that
+    % sequence, and `NextIndex' is the offset immediately following that
+    % sequence.
+    %
+    % If the code unit in `String' at `Index' is part of an ill-formed sequence
+    % then `Char' is either U+FFFD REPLACEMENT CHARACTER (if strings use UTF-8
+    % encoding) or the unpaired surrogate code point at `Index' (if strings use
+    % UTF-16 encoding), and `NextIndex' is Index + 1.
+    %
+    % Fails if `Index' is out of range (negative, or greater than or equal to
+    % the length of `String').
     %
 :- pred index_next(string::in, int::in, int::out, char::uo) is semidet.
 
     % unsafe_index_next(String, Index, NextIndex, Char):
     %
-    % `Char' is the character (code point) in `String', beginning at the
-    % code unit `Index'. `NextIndex' is the offset following the encoding
-    % of `Char'. Fails if `Index' is equal to the length of `String'.
+    % Like index_next/4 but does not check that `Index' is in range.
+    % Fails if `Index' is equal to the length of `String'.
+    %
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than the length of `String').
     %
@@ -2166,9 +2175,7 @@ duplicate_char(Char, Count, String) :-
 % so that the compiler can do loop invariant hoisting on calls to them
 % that occur in loops.
 
-% XXX ILSEQ The index_next family fails if an ill-formed sequence is detected.
-% It needs to be updated to match the behaviour of the index family.
-%
+% XXX ILSEQ
 % We should allow the possibility of working with strings containing ill-formed
 % sequences. That would require predicates that can return either a code point
 % when possible, or else code units from ill-formed sequences.
@@ -2251,11 +2258,6 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
 
 %---------------------%
 
-% XXX ILSEQ index_next/4 and unsafe_index_next/4 fail when an ill-formed
-% sequence is detected, unlike the index family. Most programs will probably
-% work as intended if we just replace code units in ill-formed sequences with
-% replacement char (for UTF-8) or return the unpaired surrogate code point
-% (for UTF-16).
 index_next(Str, Index, NextIndex, Char) :-
     Len = length(Str),
     ( if index_check(Index, Len) then
@@ -2264,10 +2266,6 @@ index_next(Str, Index, NextIndex, Char) :-
         fail
     ).
 
-% XXX ILSEQ Behaviour of unsafe_index_next depends on target language.
-%   - c: fails at ill-formed sequence
-%   - java: fails at unpaired surrogate
-%   - csharp: System.Char.ConvertToUtf32 throws exception for unpaired surrogate
 :- pragma foreign_proc("C",
     unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -2280,7 +2278,11 @@ index_next(Str, Index, NextIndex, Char) :-
     } else {
         NextIndex = Index;
         Ch = MR_utf8_get_next_mb(Str, &NextIndex);
-        SUCCESS_INDICATOR = (Ch > 0);
+        if (Ch < 0) {
+            Ch = 0xfffd;
+            NextIndex = Index + 1;
+        }
+        SUCCESS_INDICATOR = MR_TRUE;
     }
 ").
 :- pragma foreign_proc("C#",
@@ -2288,7 +2290,7 @@ index_next(Str, Index, NextIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    if (Index < Str.Length) {
+    try {
         Ch = System.Char.ConvertToUtf32(Str, Index);
         if (Ch <= 0xffff) {
             NextIndex = Index + 1;
@@ -2296,10 +2298,15 @@ index_next(Str, Index, NextIndex, Char) :-
             NextIndex = Index + 2;
         }
         SUCCESS_INDICATOR = true;
-    } else {
-        Ch = -1;
+    } catch (System.ArgumentOutOfRangeException) {
+        Ch = 0;
         NextIndex = Index;
         SUCCESS_INDICATOR = false;
+    } catch (System.ArgumentException) {
+        // Return unpaired surrogate code point.
+        Ch = Str[Index];
+        NextIndex = Index + 1;
+        SUCCESS_INDICATOR = true;
     }
 ").
 :- pragma foreign_proc("Java",
@@ -2307,19 +2314,12 @@ index_next(Str, Index, NextIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    if (Index < Str.length()) {
+    try {
         Ch = Str.codePointAt(Index);
-        SUCCESS_INDICATOR =
-            !java.lang.Character.isHighSurrogate((char) Ch) &&
-            !java.lang.Character.isLowSurrogate((char) Ch);
-        if (SUCCESS_INDICATOR) {
-            NextIndex = Index + java.lang.Character.charCount(Ch);
-        } else {
-            Ch = -1;
-            NextIndex = Index;
-        }
-    } else {
-        Ch = -1;
+        NextIndex = Index + java.lang.Character.charCount(Ch);
+        SUCCESS_INDICATOR = true;
+    } catch (IndexOutOfBoundsException e) {
+        Ch = 0;
         NextIndex = Index;
         SUCCESS_INDICATOR = false;
     }
@@ -2329,6 +2329,7 @@ index_next(Str, Index, NextIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
+    % XXX does not handle ill-formed sequences as described
     case Str of
         << _:Index/binary, Ch/utf8, _/binary >> ->
             if
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 631d0da19..8c317da86 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -359,6 +359,7 @@ ORDINARY_PROGS = \
 	string_codepoint \
 	string_first_char \
 	string_index_ilseq \
+	string_index_next_ilseq \
 	string_loop \
 	string_presuffix \
 	string_set_char \
diff --git a/tests/hard_coded/string_index_next_ilseq.exp b/tests/hard_coded/string_index_next_ilseq.exp
new file mode 100644
index 000000000..1ad4aba9a
--- /dev/null
+++ b/tests/hard_coded/string_index_next_ilseq.exp
@@ -0,0 +1,5 @@
+string.index_next(S, 0, 4, 0x1f600)
+string.index_next(S, 4, 5, 0xfffd)
+string.index_next(S, 5, 6, 0xfffd)
+string.index_next(S, 6, 10, 0x1f600)
+string.index_next(S, 10, _, _) failed
diff --git a/tests/hard_coded/string_index_next_ilseq.exp2 b/tests/hard_coded/string_index_next_ilseq.exp2
new file mode 100644
index 000000000..c889bf0d8
--- /dev/null
+++ b/tests/hard_coded/string_index_next_ilseq.exp2
@@ -0,0 +1,5 @@
+string.index_next(S, 0, 2, 0x1f600)
+string.index_next(S, 2, 3, 0xde00)
+string.index_next(S, 3, 4, 0xd83d)
+string.index_next(S, 4, 6, 0x1f600)
+string.index_next(S, 6, _, _) failed
diff --git a/tests/hard_coded/string_index_next_ilseq.m b/tests/hard_coded/string_index_next_ilseq.m
new file mode 100644
index 000000000..848a6fbae
--- /dev/null
+++ b/tests/hard_coded/string_index_next_ilseq.m
@@ -0,0 +1,44 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_index_next_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, 1),
+    S2 = string.between(S0, 1, 2),
+    S = S0 ++ S2 ++ S1 ++ S0,
+    test_index_next(S, 0, !IO).
+
+:- pred test_index_next(string::in, int::in, io::di, io::uo) is det.
+
+test_index_next(S, Index, !IO) :-
+    ( if string.index_next(S, Index, NextIndex, Char) then
+        io.format("string.index_next(S, %d, %d, %#x)\n",
+            [i(Index), i(NextIndex), i(char.to_int(Char))], !IO),
+        test_index_next(S, NextIndex, !IO)
+    else
+        io.format("string.index_next(S, %d, _, _) failed\n",
+            [i(Index)], !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list