[m-rev.] for review: Change behaviour of string.prev_index on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:19 AEDT 2019


library/string.m:
    Make string.prev_index and string.unsafe_prev_index
    return either U+FFFD or an unpaired surrogate code point
    when an ill-formed code unit sequence is detected.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_prev_index_ilseq.exp:
tests/hard_coded/string_prev_index_ilseq.exp2:
tests/hard_coded/string_prev_index_ilseq.m:
    Add test case.
---
 library/string.m                              | 91 ++++++++++---------
 tests/hard_coded/Mmakefile                    |  1 +
 tests/hard_coded/string_prev_index_ilseq.exp  |  5 +
 tests/hard_coded/string_prev_index_ilseq.exp2 |  5 +
 tests/hard_coded/string_prev_index_ilseq.m    | 44 +++++++++
 5 files changed, 104 insertions(+), 42 deletions(-)
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp2
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.m

diff --git a/library/string.m b/library/string.m
index b687cece3..1b0d32e87 100644
--- a/library/string.m
+++ b/library/string.m
@@ -299,19 +299,27 @@
     %
 :- pred unsafe_index_next(string::in, int::in, int::out, char::uo) is semidet.
 
-    % prev_index(String, Index, CharIndex, Char):
+    % prev_index(String, Index, PrevIndex, Char):
     %
-    % `Char' is the character (code point) in `String' immediately _before_
-    % the code unit `Index'. Fails if `Index' is out of range (non-positive,
-    % or greater than the length of `String').
+    % If `Index - 1' is the final code unit offset of a well-formed sequence in
+    % `String' then `Char' is the code point encoded by that sequence, and
+    % `PrevIndex' is the initial code unit offset of that sequence.
+    %
+    % If the code unit in `String' at `Index - 1' is part of an ill-formed
+    % sequence then `Char' is either U+FFFD REPLACEMENT CHARACTER (if strings
+    % use UTF-8 encoding) or the unpaired surrogate code point at `Index - 1'
+    % (if strings use UTF-16 encoding), and `PrevIndex' is `Index - 1'.
+    %
+    % Fails if `Index' is out of range (non-positive, or greater than the
+    % length of `String').
     %
 :- pred prev_index(string::in, int::in, int::out, char::uo) is semidet.
 
-    % unsafe_prev_index(String, Index, CharIndex, Char):
+    % unsafe_prev_index(String, Index, PrevIndex, Char):
+    %
+    % Like prev_index/4 but does not check that `Index' is in range.
+    % Fails if `Index' is zero.
     %
-    % `Char' is the character (code point) in `String' immediately _before_
-    % the code unit `Index'. `CharIndex' is the offset of the beginning of
-    % `Char'. Fails if `Index' is zero.
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than the length of `String').
     %
@@ -2350,37 +2358,37 @@ index_next(Str, Index, NextIndex, Char) :-
     end
 ").
 
-prev_index(Str, Index, CharIndex, Char) :-
+prev_index(Str, Index, PrevIndex, Char) :-
     Len = length(Str),
     ( if index_check(Index - 1, Len) then
-        unsafe_prev_index(Str, Index, CharIndex, Char)
+        unsafe_prev_index(Str, Index, PrevIndex, Char)
     else
         fail
     ).
 
-% XXX ILSEQ Behaviour of unsafe_prev_index depends on target language.
-%   - c: fails on ill-formed sequence
-%   - java: returns unpaired surrogate
-%   - csharp: fails on unpaired surrogate (excepting for the bug)
 :- pragma foreign_proc("C",
     unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    if (Index > 0) {
+    if (Index <= 0) {
+        PrevIndex = Index;
+        Ch = 0;
+        SUCCESS_INDICATOR = MR_FALSE;
+    } else {
         PrevIndex = Index - 1;
         Ch = Str[PrevIndex];
-        if (MR_is_ascii(Ch)) {
-            SUCCESS_INDICATOR = (Ch != 0);
-        } else {
+        if (! MR_is_ascii(Ch)) {
             Ch = MR_utf8_prev_get(Str, &PrevIndex);
-            // XXX ILSEQ
-            // SUCCESS_INDICATOR should be false if there are any extra bytes
-            // after the encoding of Ch, but before Index.
-            SUCCESS_INDICATOR = (Ch > 0);
+            // XXX MR_utf8_prev_get currently just scans backwards to find a
+            // lead byte, so we need a separate check to ensure no bytes are
+            // unaccounted for.
+            if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
+                Ch = 0xfffd;
+                PrevIndex = Index - 1;
+            }
         }
-    } else {
-        SUCCESS_INDICATOR = MR_FALSE;
+        SUCCESS_INDICATOR = MR_TRUE;
     }
 ").
 :- pragma foreign_proc("C#",
@@ -2388,33 +2396,31 @@ prev_index(Str, Index, CharIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    if (Index > 0) {
+    if (Index <= 0) {
+        Ch = 0;
+        PrevIndex = Index;
+        SUCCESS_INDICATOR = false;
+    } else {
         char c2 = Str[Index - 1];
-        if (System.Char.IsSurrogate(c2)) {
+        if (System.Char.IsLowSurrogate(c2)) {
             try {
                 char c1 = Str[Index - 2];
                 Ch = System.Char.ConvertToUtf32(c1, c2);
                 PrevIndex = Index - 2;
-            } catch (System.IndexOutOfRangeException) {
-                Ch = -1;
-                PrevIndex = Index;
-                SUCCESS_INDICATOR = false;
             } catch (System.ArgumentOutOfRangeException) {
-                Ch = -1;
-                PrevIndex = Index;
-                SUCCESS_INDICATOR = false;
+                // Return unpaired surrogate code point.
+                Ch = (int) c2;
+                PrevIndex = Index - 1;
+            } catch (System.IndexOutOfRangeException) {
+                // Return unpaired surrogate code point.
+                Ch = (int) c2;
+                PrevIndex = Index - 1;
             }
         } else {
-            // Common case.
             Ch = (int) c2;
             PrevIndex = Index - 1;
         }
-        // XXX ILSEQ bug, check (Ch >= 0)
         SUCCESS_INDICATOR = true;
-    } else {
-        Ch = -1;
-        PrevIndex = Index;
-        SUCCESS_INDICATOR = false;
     }
 ").
 :- pragma foreign_proc("Java",
@@ -2422,12 +2428,12 @@ prev_index(Str, Index, CharIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    if (Index > 0) {
+    try {
         Ch = Str.codePointBefore(Index);
         PrevIndex = Index - java.lang.Character.charCount(Ch);
         SUCCESS_INDICATOR = true;
-    } else {
-        Ch = -1;
+    } catch (IndexOutOfBoundsException e) {
+        Ch = 0;
         PrevIndex = Index;
         SUCCESS_INDICATOR = false;
     }
@@ -2437,6 +2443,7 @@ prev_index(Str, Index, CharIndex, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, may_not_duplicate, no_sharing],
 "
+    % XXX does not handle ill-formed sequences as described
     {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
     SUCCESS_INDICATOR = (Ch =/= -1)
 ").
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 8c317da86..84e38bd34 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -362,6 +362,7 @@ ORDINARY_PROGS = \
 	string_index_next_ilseq \
 	string_loop \
 	string_presuffix \
+	string_prev_index_ilseq \
 	string_set_char \
 	string_split \
 	string_split_2 \
diff --git a/tests/hard_coded/string_prev_index_ilseq.exp b/tests/hard_coded/string_prev_index_ilseq.exp
new file mode 100644
index 000000000..3dbadd510
--- /dev/null
+++ b/tests/hard_coded/string_prev_index_ilseq.exp
@@ -0,0 +1,5 @@
+string.prev_index(S, 10, 6, 0x1f600)
+string.prev_index(S, 6, 5, 0xfffd)
+string.prev_index(S, 5, 4, 0xfffd)
+string.prev_index(S, 4, 0, 0x1f600)
+string.prev_index(S, 0, _, _) failed
diff --git a/tests/hard_coded/string_prev_index_ilseq.exp2 b/tests/hard_coded/string_prev_index_ilseq.exp2
new file mode 100644
index 000000000..8f61dd24d
--- /dev/null
+++ b/tests/hard_coded/string_prev_index_ilseq.exp2
@@ -0,0 +1,5 @@
+string.prev_index(S, 6, 4, 0x1f600)
+string.prev_index(S, 4, 3, 0xd83d)
+string.prev_index(S, 3, 2, 0xde00)
+string.prev_index(S, 2, 0, 0x1f600)
+string.prev_index(S, 0, _, _) failed
diff --git a/tests/hard_coded/string_prev_index_ilseq.m b/tests/hard_coded/string_prev_index_ilseq.m
new file mode 100644
index 000000000..ce4b427b6
--- /dev/null
+++ b/tests/hard_coded/string_prev_index_ilseq.m
@@ -0,0 +1,44 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_prev_index_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, 1),
+    S2 = string.between(S0, 1, 2),
+    S = S0 ++ S2 ++ S1 ++ S0,
+    test_prev_index(S, count_code_units(S), !IO).
+
+:- pred test_prev_index(string::in, int::in, io::di, io::uo) is det.
+
+test_prev_index(S, Index, !IO) :-
+    ( if string.prev_index(S, Index, PrevIndex, Char) then
+        io.format("string.prev_index(S, %d, %d, %#x)\n",
+            [i(Index), i(PrevIndex), i(char.to_int(Char))], !IO),
+        test_prev_index(S, PrevIndex, !IO)
+    else
+        io.format("string.prev_index(S, %d, _, _) failed\n",
+            [i(Index)], !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list