<div dir="ltr">This looks fine. Thankyou for the effort!<div><br></div><div>Mark</div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Wed, Oct 16, 2019 at 5:06 PM Peter Wang <<a href="mailto:novalazy@gmail.com">novalazy@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">library/string.m:<br>
    Define how string.codepoint_offset counts code units in ill-formed<br>
    sequences.<br>
<br>
    Delete C and C# foreign implementations in favour of the Mercury<br>
    implementation that has the intended behaviour.<br>
    (The Java implementation uses String.offsetByCodePoints which<br>
    also matches our intended behaviour.)<br>
<br>
tests/hard_coded/Mmakefile:<br>
tests/hard_coded/string_codepoint_offset_ilseq.exp2:<br>
tests/hard_coded/string_codepoint_offset_ilseq.m:<br>
    Add test case.<br>
---<br>
 library/string.m                              | 58 ++++-----------<br>
 tests/hard_coded/Mmakefile                    |  1 +<br>
 .../string_codepoint_offset_ilseq.exp         | 30 ++++++++<br>
 .../string_codepoint_offset_ilseq.exp2        | 22 ++++++<br>
 .../string_codepoint_offset_ilseq.m           | 72 +++++++++++++++++++<br>
 5 files changed, 137 insertions(+), 46 deletions(-)<br>
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp<br>
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp2<br>
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.m<br>
<br>
diff --git a/library/string.m b/library/string.m<br>
index 10bbf6f5b..a497293d7 100644<br>
--- a/library/string.m<br>
+++ b/library/string.m<br>
@@ -438,17 +438,23 @@<br>
     %<br>
 :- func count_utf8_code_units(string) = int.<br>
<br>
-    % codepoint_offset(String, StartOffset, CodePointCount, CodePointOffset):<br>
+    % codepoint_offset(String, StartOffset, Count, Offset):<br>
     %<br>
-    % Return the offset into `String' where, starting from `StartOffset',<br>
-    % `CodePointCount' code points are skipped. Fails if either `StartOffset'<br>
-    % or `CodePointOffset' are out of range.<br>
+    % Let `S' be the substring of `String' from code unit `StartOffset' to the<br>
+    % end of the string. `Offset' is code unit offset after advancing `Count'<br>
+    % steps in `S', where each step skips over either:<br>
+    %  - one encoding of a Unicode code point, or<br>
+    %  - one code unit that is part of an ill-formed sequence.<br>
+    %<br>
+    % Fails if `StartOffset' is out of range (negative, or greater than the<br>
+    % length of `String'), or if there are fewer than `Count' steps possible<br>
+    % in `S'.<br>
     %<br>
 :- pred codepoint_offset(string::in, int::in, int::in, int::out) is semidet.<br>
<br>
-    % codepoint_offset(String, CodePointCount, CodePointOffset):<br>
+    % codepoint_offset(String, Count, Offset):<br>
     %<br>
-    % Same as `codepoint_offset(String, 0, CodePointCount, CodePointOffset)'.<br>
+    % Same as `codepoint_offset(String, 0, Count, Offset)'.<br>
     %<br>
 :- pred codepoint_offset(string::in, int::in, int::out) is semidet.<br>
<br>
@@ -2807,43 +2813,6 @@ count_utf8_code_units_2(Char, !Length) :-<br>
<br>
 %---------------------%<br>
<br>
-% XXX ILSEQ Behaviour depends on target language.<br>
-% Should be made consistent so that each code unit in an ill-formed sequence<br>
-% counts as one (pseudo) code point.<br>
-<br>
-:- pragma foreign_proc("C",<br>
-    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),<br>
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],<br>
-"<br>
-    size_t          len;<br>
-    unsigned char   b;<br>
-<br>
-    SUCCESS_INDICATOR = MR_FALSE;<br>
-    len = strlen(String);<br>
-    for (Index = StartOffset; Index < len; Index++) {<br>
-        b = String[Index];<br>
-        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {<br>
-            if (N-- == 0) {<br>
-                SUCCESS_INDICATOR = MR_TRUE;<br>
-                break;<br>
-            }<br>
-        }<br>
-    }<br>
-").<br>
-:- pragma foreign_proc("C#",<br>
-    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),<br>
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],<br>
-"<br>
-    SUCCESS_INDICATOR = false;<br>
-    for (Index = StartOffset; Index < String.Length; Index++) {<br>
-        if (!System.Char.IsLowSurrogate(String, Index)) {<br>
-            if (N-- == 0) {<br>
-                SUCCESS_INDICATOR = true;<br>
-                break;<br>
-            }<br>
-        }<br>
-    }<br>
-").<br>
 :- pragma foreign_proc("Java",<br>
     codepoint_offset(String::in, StartOffset::in, N::in, Index::out),<br>
     [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],<br>
@@ -2877,9 +2846,6 @@ codepoint_offset_loop(String, Offset, Length, N, Index) :-<br>
 %---------------------------------------------------------------------------%<br>
<br>
 codepoint_offset(String, N, Index) :-<br>
-    % XXX ILSEQ<br>
-    % Note: we do not define what happens with unpaired surrogates.<br>
-    %<br>
     codepoint_offset(String, 0, N, Index).<br>
<br>
 %---------------------------------------------------------------------------%<br>
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile<br>
index 626f60c6a..c7c30ba02 100644<br>
--- a/tests/hard_coded/Mmakefile<br>
+++ b/tests/hard_coded/Mmakefile<br>
@@ -358,6 +358,7 @@ ORDINARY_PROGS = \<br>
        string_class \<br>
        string_code_unit \<br>
        string_codepoint \<br>
+       string_codepoint_offset_ilseq \<br>
        string_count_codepoints_ilseq \<br>
        string_first_char \<br>
        string_fold_ilseq \<br>
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp b/tests/hard_coded/string_codepoint_offset_ilseq.exp<br>
new file mode 100644<br>
index 000000000..fd2b09e09<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp<br>
@@ -0,0 +1,30 @@<br>
+start counting from offset 0<br>
+string.codepoint_offset(S, 0, 0, 0); Char = a<br>
+string.codepoint_offset(S, 0, 1, 1); Char = 😀<br>
+string.codepoint_offset(S, 0, 2, 5); Char = b<br>
+string.codepoint_offset(S, 0, 3, 6); Char = 0xfffd<br>
+string.codepoint_offset(S, 0, 4, 7); Char = 0xfffd<br>
+string.codepoint_offset(S, 0, 5, 8); Char = 0xfffd<br>
+string.codepoint_offset(S, 0, 6, 9); Char = z<br>
+string.codepoint_offset(S, 0, 7, _) failed<br>
+<br>
+start counting from offset 1<br>
+string.codepoint_offset(S, 1, 0, 1); Char = 😀<br>
+string.codepoint_offset(S, 1, 1, 5); Char = b<br>
+string.codepoint_offset(S, 1, 2, 6); Char = 0xfffd<br>
+string.codepoint_offset(S, 1, 3, 7); Char = 0xfffd<br>
+string.codepoint_offset(S, 1, 4, 8); Char = 0xfffd<br>
+string.codepoint_offset(S, 1, 5, 9); Char = z<br>
+string.codepoint_offset(S, 1, 6, _) failed<br>
+<br>
+start counting from offset 2<br>
+string.codepoint_offset(S, 2, 0, 2); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 1, 3); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 2, 4); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 3, 5); Char = b<br>
+string.codepoint_offset(S, 2, 4, 6); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 5, 7); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 6, 8); Char = 0xfffd<br>
+string.codepoint_offset(S, 2, 7, 9); Char = z<br>
+string.codepoint_offset(S, 2, 8, _) failed<br>
+<br>
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp2 b/tests/hard_coded/string_codepoint_offset_ilseq.exp2<br>
new file mode 100644<br>
index 000000000..fd5576062<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp2<br>
@@ -0,0 +1,22 @@<br>
+start counting from offset 0<br>
+string.codepoint_offset(S, 0, 0, 0); Char = a<br>
+string.codepoint_offset(S, 0, 1, 1); Char = 😀<br>
+string.codepoint_offset(S, 0, 2, 3); Char = b<br>
+string.codepoint_offset(S, 0, 3, 4); Char = 0xd83d<br>
+string.codepoint_offset(S, 0, 4, 5); Char = z<br>
+string.codepoint_offset(S, 0, 5, _) failed<br>
+<br>
+start counting from offset 1<br>
+string.codepoint_offset(S, 1, 0, 1); Char = 😀<br>
+string.codepoint_offset(S, 1, 1, 3); Char = b<br>
+string.codepoint_offset(S, 1, 2, 4); Char = 0xd83d<br>
+string.codepoint_offset(S, 1, 3, 5); Char = z<br>
+string.codepoint_offset(S, 1, 4, _) failed<br>
+<br>
+start counting from offset 2<br>
+string.codepoint_offset(S, 2, 0, 2); Char = 0xde00<br>
+string.codepoint_offset(S, 2, 1, 3); Char = b<br>
+string.codepoint_offset(S, 2, 2, 4); Char = 0xd83d<br>
+string.codepoint_offset(S, 2, 3, 5); Char = z<br>
+string.codepoint_offset(S, 2, 4, _) failed<br>
+<br>
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.m b/tests/hard_coded/string_codepoint_offset_ilseq.m<br>
new file mode 100644<br>
index 000000000..cecb2329c<br>
--- /dev/null<br>
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.m<br>
@@ -0,0 +1,72 @@<br>
+%---------------------------------------------------------------------------%<br>
+% vim: ts=4 sw=4 et ft=mercury<br>
+%---------------------------------------------------------------------------%<br>
+%<br>
+% The .exp file is for backends using UTF-8 string encoding.<br>
+% The .exp2 file is for backends using UTF-16 string encoding.<br>
+%<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+:- module string_codepoint_offset_ilseq.<br>
+:- interface.<br>
+<br>
+:- import_module io.<br>
+<br>
+:- pred main(io::di, io::uo) is det.<br>
+<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+:- implementation.<br>
+<br>
+:- import_module char.<br>
+:- import_module int.<br>
+:- import_module list.<br>
+:- import_module string.<br>
+<br>
+%---------------------------------------------------------------------------%<br>
+<br>
+main(!IO) :-<br>
+    S0 = "😀",<br>
+    S1 = string.between(S0, 0, count_code_units(S0) - 1),<br>
+    S = "a" ++ S0 ++ "b" ++ S1 ++ "z",<br>
+<br>
+    io.write_string("start counting from offset 0\n", !IO),<br>
+    test_codepoint_offset(S, 0, 0, !IO),<br>
+    <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO),<br>
+<br>
+    io.write_string("start counting from offset 1\n", !IO),<br>
+    test_codepoint_offset(S, 1, 0, !IO),<br>
+    <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO),<br>
+<br>
+    io.write_string("start counting from offset 2\n", !IO),<br>
+    test_codepoint_offset(S, 2, 0, !IO),<br>
+    <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO).<br>
+<br>
+:- pred test_codepoint_offset(string::in, int::in, int::in, io::di, io::uo)<br>
+    is det.<br>
+<br>
+test_codepoint_offset(S, StartOffset, Count, !IO) :-<br>
+    ( if string.codepoint_offset(S, StartOffset, Count, Offset) then<br>
+        io.format("string.codepoint_offset(S, %d, %d, %d); ",<br>
+            [i(StartOffset), i(Count), i(Offset)], !IO),<br>
+        ( if string.index(S, Offset, Char) then<br>
+            io.write_string("Char = ", !IO),<br>
+            write_char_or_hex(Char, !IO),<br>
+            <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO)<br>
+        else<br>
+            io.write_string("string.index/3 failed\n", !IO)<br>
+        ),<br>
+        test_codepoint_offset(S, StartOffset, Count + 1, !IO)<br>
+    else<br>
+        io.format("string.codepoint_offset(S, %d, %d, _) failed\n",<br>
+            [i(StartOffset), i(Count)], !IO)<br>
+    ).<br>
+<br>
+:- pred write_char_or_hex(char::in, io::di, io::uo) is det.<br>
+<br>
+write_char_or_hex(Char, !IO) :-<br>
+    ( if Char = '\ufffd' ; char.is_surrogate(Char) then<br>
+        io.format("%#x", [i(char.to_int(Char))], !IO)<br>
+    else<br>
+        io.write_char(Char, !IO)<br>
+    ).<br>
-- <br>
2.23.0<br>
<br>
_______________________________________________<br>
reviews mailing list<br>
<a href="mailto:reviews@lists.mercurylang.org" target="_blank">reviews@lists.mercurylang.org</a><br>
<a href="https://lists.mercurylang.org/listinfo/reviews" rel="noreferrer" target="_blank">https://lists.mercurylang.org/listinfo/reviews</a><br>
</blockquote></div>