<div dir="ltr">Looks good to me.</div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Wed, Oct 16, 2019 at 5:04 PM Peter Wang <<a href="mailto:novalazy@gmail.com">novalazy@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">library/string.m:<br>
    Make string.prev_index and string.unsafe_prev_index<br>
    return either U+FFFD or an unpaired surrogate code point<br>
    when an ill-formed code unit sequence is detected.<br>
<br>
tests/hard_coded/Mmakefile:<br>
tests/hard_coded/string_prev_index_ilseq.exp:<br>
tests/hard_coded/string_prev_index_ilseq.exp2:<br>
tests/hard_coded/string_prev_index_ilseq.m:<br>
    Add test case.<br>
---<br>
 library/string.m                              | 91 ++++++++++---------<br>
 tests/hard_coded/Mmakefile                    |  1 +<br>
 tests/hard_coded/string_prev_index_ilseq.exp  |  5 +<br>
 tests/hard_coded/string_prev_index_ilseq.exp2 |  5 +<br>
 tests/hard_coded/string_prev_index_ilseq.m    | 44 +++++++++<br>
 5 files changed, 104 insertions(+), 42 deletions(-)<br>
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp<br>
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.exp2<br>
 create mode 100644 tests/hard_coded/string_prev_index_ilseq.m<br>
<br>
diff --git a/library/string.m b/library/string.m<br>
index b687cece3..1b0d32e87 100644<br>
--- a/library/string.m<br>
+++ b/library/string.m<br>
@@ -299,19 +299,27 @@<br>
     %<br>
 :- pred unsafe_index_next(string::in, int::in, int::out, char::uo) is semidet.<br>
<br>
-    % prev_index(String, Index, CharIndex, Char):<br>
+    % prev_index(String, Index, PrevIndex, Char):<br>
     %<br>
-    % `Char' is the character (code point) in `String' immediately _before_<br>
-    % the code unit `Index'. Fails if `Index' is out of range (non-positive,<br>
-    % or greater than the length of `String').<br>
+    % If `Index - 1' is the final code unit offset of a well-formed sequence in<br>
+    % `String' then `Char' is the code point encoded by that sequence, and<br>
+    % `PrevIndex' is the initial code unit offset of that sequence.<br>
+    %<br>
+    % If the code unit in `String' at `Index - 1' is part of an ill-formed<br>
+    % sequence then `Char' is either U+FFFD REPLACEMENT CHARACTER (if strings<br>
+    % use UTF-8 encoding) or the unpaired surrogate code point at `Index - 1'<br>
+    % (if strings use UTF-16 encoding), and `PrevIndex' is `Index - 1'.<br>
+    %<br>
+    % Fails if `Index' is out of range (non-positive, or greater than the<br>
+    % length of `String').<br>
     %<br>
 :- pred prev_index(string::in, int::in, int::out, char::uo) is semidet.<br>
<br>
-    % unsafe_prev_index(String, Index, CharIndex, Char):<br>
+    % unsafe_prev_index(String, Index, PrevIndex, Char):<br>
+    %<br>
+    % Like prev_index/4 but does not check that `Index' is in range.<br>
+    % Fails if `Index' is zero.<br>
     %<br>
-    % `Char' is the character (code point) in `String' immediately _before_<br>
-    % the code unit `Index'. `CharIndex' is the offset of the beginning of<br>
-    % `Char'. Fails if `Index' is zero.<br>
     % WARNING: behavior is UNDEFINED if `Index' is out of range<br>
     % (negative, or greater than the length of `String').<br>
     %<br>
@@ -2350,37 +2358,37 @@ index_next(Str, Index, NextIndex, Char) :-<br>
     end<br>
 ").<br>
<br>
-prev_index(Str, Index, CharIndex, Char) :-<br>
+prev_index(Str, Index, PrevIndex, Char) :-<br>
     Len = length(Str),<br>
     ( if index_check(Index - 1, Len) then<br>
-        unsafe_prev_index(Str, Index, CharIndex, Char)<br>
+        unsafe_prev_index(Str, Index, PrevIndex, Char)<br>
     else<br>
         fail<br>
     ).<br>
<br>
-% XXX ILSEQ Behaviour of unsafe_prev_index depends on target language.<br>
-%   - c: fails on ill-formed sequence<br>
-%   - java: returns unpaired surrogate<br>
-%   - csharp: fails on unpaired surrogate (excepting for the bug)<br>
 :- pragma foreign_proc("C",<br>
     unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),<br>
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,<br>
         does_not_affect_liveness, no_sharing],<br>
 "<br>
-    if (Index > 0) {<br>
+    if (Index <= 0) {<br>
+        PrevIndex = Index;<br>
+        Ch = 0;<br>
+        SUCCESS_INDICATOR = MR_FALSE;<br>
+    } else {<br>
         PrevIndex = Index - 1;<br>
         Ch = Str[PrevIndex];<br>
-        if (MR_is_ascii(Ch)) {<br>
-            SUCCESS_INDICATOR = (Ch != 0);<br>
-        } else {<br>
+        if (! MR_is_ascii(Ch)) {<br>
             Ch = MR_utf8_prev_get(Str, &PrevIndex);<br>
-            // XXX ILSEQ<br>
-            // SUCCESS_INDICATOR should be false if there are any extra bytes<br>
-            // after the encoding of Ch, but before Index.<br>
-            SUCCESS_INDICATOR = (Ch > 0);<br>
+            // XXX MR_utf8_prev_get currently just scans backwards to find a<br>
+            // lead byte, so we need a separate check to ensure no bytes are<br>
+            // unaccounted for.<br>
+            if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {<br>
+                Ch = 0xfffd;<br>
+                PrevIndex = Index - 1;<br>
+            }<br>
         }<br>
-    } else {<br>
-        SUCCESS_INDICATOR = MR_FALSE;<br>
+        SUCCESS_INDICATOR = MR_TRUE;<br>
     }<br>
 ").<br>
 :- pragma foreign_proc("C#",<br>
@@ -2388,33 +2396,31 @@ prev_index(Str, Index, CharIndex, Char) :-<br>
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,<br>
         does_not_affect_liveness, no_sharing],<br>
 "<br>
-    if (Index > 0) {<br>
+    if (Index <= 0) {<br>
+        Ch = 0;<br>
+        PrevIndex = Index;<br>
+        SUCCESS_INDICATOR = false;<br>
+    } else {<br>
         char c2 = Str[Index - 1];<br>
-        if (System.Char.IsSurrogate(c2)) {<br>
+        if (System.Char.IsLowSurrogate(c2)) {<br>
             try {<br>
                 char c1 = Str[Index - 2];<br>
                 Ch = System.Char.ConvertToUtf32(c1, c2);<br>
                 PrevIndex = Index - 2;<br>
-            } catch (System.IndexOutOfRangeException) {<br>
-                Ch = -1;<br>
-                PrevIndex = Index;<br>
-                SUCCESS_INDICATOR = false;<br>
             } catch (System.ArgumentOutOfRangeException) {<br>
-                Ch = -1;<br>
-                PrevIndex = Index;<br>
-                SUCCESS_INDICATOR = false;<br>
+                // Return unpaired surrogate code point.<br>
+                Ch = (int) c2;<br>
+                PrevIndex = Index - 1;<br>
+            } catch (System.IndexOutOfRangeException) {<br>
+                // Return unpaired surrogate code point.<br>
+                Ch = (int) c2;<br>
+                PrevIndex = Index - 1;<br>
             }<br>
         } else {<br>
-            // Common case.<br>
             Ch = (int) c2;<br>
             PrevIndex = Index - 1;<br>
         }<br>
-        // XXX ILSEQ bug, check (Ch >= 0)<br>
         SUCCESS_INDICATOR = true;<br>
-    } else {<br>
-        Ch = -1;<br>
-        PrevIndex = Index;<br>
-        SUCCESS_INDICATOR = false;<br>
     }<br>
 ").<br>
 :- pragma foreign_proc("Java",<br>
@@ -2422,12 +2428,12 @@ prev_index(Str, Index, CharIndex, Char) :-<br>
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,<br>
         does_not_affect_liveness, no_sharing],<br>
 "<br>
-    if (Index > 0) {<br>
+    try {<br>
         Ch = Str.codePointBefore(Index);<br>
         PrevIndex = Index - java.lang.Character.charCount(Ch);<br>
         SUCCESS_INDICATOR = true;<br>
-    } else {<br>
-        Ch = -1;<br>
+    } catch (IndexOutOfBoundsException e) {<br>
+        Ch = 0;<br>
         PrevIndex = Index;<br>
         SUCCESS_INDICATOR = false;<br>
     }<br>
@@ -2437,6 +2443,7 @@ prev_index(Str, Index, CharIndex, Char) :-<br>
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,<br>
         does_not_affect_liveness, may_not_duplicate, no_sharing],<br>
 "<br>
+    % XXX does not handle ill-formed sequences as described<br>
     {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),<br>
     SUCCESS_INDICATOR = (Ch =/= -1)<br>
 ").<br>
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile<br>
index 8c317da86..84e38bd34 100644<br>
--- a/tests/hard_coded/Mmakefile<br>
+++ b/tests/hard_coded/Mmakefile<br>
@@ -362,6 +362,7 @@ ORDINARY_PROGS = \<br>
        string_index_next_ilseq \<br>
        string_loop \<br>
        string_presuffix \<br>
+       string_prev_index_ilseq \<br>
        string_set_char \<br>
        string_split \<br>
        string_split_2 \<br>
diff --git a/tests/hard_coded/string_prev_index_ilseq.exp b/tests/hard_coded/string_prev_index_ilseq.exp<br>
new file mode 100644<br>
index 000000000..3dbadd510<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_prev_index_ilseq.exp<br>
@@ -0,0 +1,5 @@<br>
+string.prev_index(S, 10, 6, 0x1f600)<br>
+string.prev_index(S, 6, 5, 0xfffd)<br>
+string.prev_index(S, 5, 4, 0xfffd)<br>
+string.prev_index(S, 4, 0, 0x1f600)<br>
+string.prev_index(S, 0, _, _) failed<br>
diff --git a/tests/hard_coded/string_prev_index_ilseq.exp2 b/tests/hard_coded/string_prev_index_ilseq.exp2<br>
new file mode 100644<br>
index 000000000..8f61dd24d<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_prev_index_ilseq.exp2<br>
@@ -0,0 +1,5 @@<br>
+string.prev_index(S, 6, 4, 0x1f600)<br>
+string.prev_index(S, 4, 3, 0xd83d)<br>
+string.prev_index(S, 3, 2, 0xde00)<br>
+string.prev_index(S, 2, 0, 0x1f600)<br>
+string.prev_index(S, 0, _, _) failed<br>
diff --git a/tests/hard_coded/string_prev_index_ilseq.m b/tests/hard_coded/string_prev_index_ilseq.m<br>
new file mode 100644<br>
index 000000000..ce4b427b6<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_prev_index_ilseq.m<br>
@@ -0,0 +1,44 @@<br>
+%---------------------------------------------------------------------------%<br>
+% vim: ts=4 sw=4 et ft=mercury<br>
+%---------------------------------------------------------------------------%<br>
+%<br>
+% The .exp file is for backends using UTF-8 string encoding.<br>
+% The .exp2 file is for backends using UTF-16 string encoding.<br>
+%<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+:- module string_prev_index_ilseq.<br>
+:- interface.<br>
+<br>
+:- import_module io.<br>
+<br>
+:- pred main(io::di, io::uo) is det.<br>
+<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+:- implementation.<br>
+<br>
+:- import_module char.<br>
+:- import_module list.<br>
+:- import_module string.<br>
+<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+main(!IO) :-<br>
+    S0 = "😀",<br>
+    S1 = string.between(S0, 0, 1),<br>
+    S2 = string.between(S0, 1, 2),<br>
+    S = S0 ++ S2 ++ S1 ++ S0,<br>
+    test_prev_index(S, count_code_units(S), !IO).<br>
+<br>
+:- pred test_prev_index(string::in, int::in, io::di, io::uo) is det.<br>
+<br>
+test_prev_index(S, Index, !IO) :-<br>
+    ( if string.prev_index(S, Index, PrevIndex, Char) then<br>
+        io.format("string.prev_index(S, %d, %d, %#x)\n",<br>
+            [i(Index), i(PrevIndex), i(char.to_int(Char))], !IO),<br>
+        test_prev_index(S, PrevIndex, !IO)<br>
+    else<br>
+        io.format("string.prev_index(S, %d, _, _) failed\n",<br>
+            [i(Index)], !IO)<br>
+    ).<br>
-- <br>
2.23.0<br>
<br>
_______________________________________________<br>
reviews mailing list<br>
<a href="mailto:reviews@lists.mercurylang.org" target="_blank">reviews@lists.mercurylang.org</a><br>
<a href="https://lists.mercurylang.org/listinfo/reviews" rel="noreferrer" target="_blank">https://lists.mercurylang.org/listinfo/reviews</a><br>
</blockquote></div>