[m-rev.] for review: Define behaviour of string.codepoint_offset on ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:23 AEDT 2019


library/string.m:
    Define how string.codepoint_offset counts code units in ill-formed
    sequences.

    Delete C and C# foreign implementations in favour of the Mercury
    implementation that has the intended behaviour.
    (The Java implementation uses String.offsetByCodePoints which
    also matches our intended behaviour.)

tests/hard_coded/Mmakefile:
tests/hard_coded/string_codepoint_offset_ilseq.exp2:
tests/hard_coded/string_codepoint_offset_ilseq.m:
    Add test case.
---
 library/string.m                              | 58 ++++-----------
 tests/hard_coded/Mmakefile                    |  1 +
 .../string_codepoint_offset_ilseq.exp         | 30 ++++++++
 .../string_codepoint_offset_ilseq.exp2        | 22 ++++++
 .../string_codepoint_offset_ilseq.m           | 72 +++++++++++++++++++
 5 files changed, 137 insertions(+), 46 deletions(-)
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.exp2
 create mode 100644 tests/hard_coded/string_codepoint_offset_ilseq.m

diff --git a/library/string.m b/library/string.m
index 10bbf6f5b..a497293d7 100644
--- a/library/string.m
+++ b/library/string.m
@@ -438,17 +438,23 @@
     %
 :- func count_utf8_code_units(string) = int.
 
-    % codepoint_offset(String, StartOffset, CodePointCount, CodePointOffset):
+    % codepoint_offset(String, StartOffset, Count, Offset):
     %
-    % Return the offset into `String' where, starting from `StartOffset',
-    % `CodePointCount' code points are skipped. Fails if either `StartOffset'
-    % or `CodePointOffset' are out of range.
+    % Let `S' be the substring of `String' from code unit `StartOffset' to the
+    % end of the string. `Offset' is code unit offset after advancing `Count'
+    % steps in `S', where each step skips over either:
+    %  - one encoding of a Unicode code point, or
+    %  - one code unit that is part of an ill-formed sequence.
+    %
+    % Fails if `StartOffset' is out of range (negative, or greater than the
+    % length of `String'), or if there are fewer than `Count' steps possible
+    % in `S'.
     %
 :- pred codepoint_offset(string::in, int::in, int::in, int::out) is semidet.
 
-    % codepoint_offset(String, CodePointCount, CodePointOffset):
+    % codepoint_offset(String, Count, Offset):
     %
-    % Same as `codepoint_offset(String, 0, CodePointCount, CodePointOffset)'.
+    % Same as `codepoint_offset(String, 0, Count, Offset)'.
     %
 :- pred codepoint_offset(string::in, int::in, int::out) is semidet.
 
@@ -2807,43 +2813,6 @@ count_utf8_code_units_2(Char, !Length) :-
 
 %---------------------%
 
-% XXX ILSEQ Behaviour depends on target language.
-% Should be made consistent so that each code unit in an ill-formed sequence
-% counts as one (pseudo) code point.
-
-:- pragma foreign_proc("C",
-    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
-"
-    size_t          len;
-    unsigned char   b;
-
-    SUCCESS_INDICATOR = MR_FALSE;
-    len = strlen(String);
-    for (Index = StartOffset; Index < len; Index++) {
-        b = String[Index];
-        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
-            if (N-- == 0) {
-                SUCCESS_INDICATOR = MR_TRUE;
-                break;
-            }
-        }
-    }
-").
-:- pragma foreign_proc("C#",
-    codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
-"
-    SUCCESS_INDICATOR = false;
-    for (Index = StartOffset; Index < String.Length; Index++) {
-        if (!System.Char.IsLowSurrogate(String, Index)) {
-            if (N-- == 0) {
-                SUCCESS_INDICATOR = true;
-                break;
-            }
-        }
-    }
-").
 :- pragma foreign_proc("Java",
     codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
     [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
@@ -2877,9 +2846,6 @@ codepoint_offset_loop(String, Offset, Length, N, Index) :-
 %---------------------------------------------------------------------------%
 
 codepoint_offset(String, N, Index) :-
-    % XXX ILSEQ
-    % Note: we do not define what happens with unpaired surrogates.
-    %
     codepoint_offset(String, 0, N, Index).
 
 %---------------------------------------------------------------------------%
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 626f60c6a..c7c30ba02 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -358,6 +358,7 @@ ORDINARY_PROGS = \
 	string_class \
 	string_code_unit \
 	string_codepoint \
+	string_codepoint_offset_ilseq \
 	string_count_codepoints_ilseq \
 	string_first_char \
 	string_fold_ilseq \
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp b/tests/hard_coded/string_codepoint_offset_ilseq.exp
new file mode 100644
index 000000000..fd2b09e09
--- /dev/null
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp
@@ -0,0 +1,30 @@
+start counting from offset 0
+string.codepoint_offset(S, 0, 0, 0); Char = a
+string.codepoint_offset(S, 0, 1, 1); Char = 😀
+string.codepoint_offset(S, 0, 2, 5); Char = b
+string.codepoint_offset(S, 0, 3, 6); Char = 0xfffd
+string.codepoint_offset(S, 0, 4, 7); Char = 0xfffd
+string.codepoint_offset(S, 0, 5, 8); Char = 0xfffd
+string.codepoint_offset(S, 0, 6, 9); Char = z
+string.codepoint_offset(S, 0, 7, _) failed
+
+start counting from offset 1
+string.codepoint_offset(S, 1, 0, 1); Char = 😀
+string.codepoint_offset(S, 1, 1, 5); Char = b
+string.codepoint_offset(S, 1, 2, 6); Char = 0xfffd
+string.codepoint_offset(S, 1, 3, 7); Char = 0xfffd
+string.codepoint_offset(S, 1, 4, 8); Char = 0xfffd
+string.codepoint_offset(S, 1, 5, 9); Char = z
+string.codepoint_offset(S, 1, 6, _) failed
+
+start counting from offset 2
+string.codepoint_offset(S, 2, 0, 2); Char = 0xfffd
+string.codepoint_offset(S, 2, 1, 3); Char = 0xfffd
+string.codepoint_offset(S, 2, 2, 4); Char = 0xfffd
+string.codepoint_offset(S, 2, 3, 5); Char = b
+string.codepoint_offset(S, 2, 4, 6); Char = 0xfffd
+string.codepoint_offset(S, 2, 5, 7); Char = 0xfffd
+string.codepoint_offset(S, 2, 6, 8); Char = 0xfffd
+string.codepoint_offset(S, 2, 7, 9); Char = z
+string.codepoint_offset(S, 2, 8, _) failed
+
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.exp2 b/tests/hard_coded/string_codepoint_offset_ilseq.exp2
new file mode 100644
index 000000000..fd5576062
--- /dev/null
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.exp2
@@ -0,0 +1,22 @@
+start counting from offset 0
+string.codepoint_offset(S, 0, 0, 0); Char = a
+string.codepoint_offset(S, 0, 1, 1); Char = 😀
+string.codepoint_offset(S, 0, 2, 3); Char = b
+string.codepoint_offset(S, 0, 3, 4); Char = 0xd83d
+string.codepoint_offset(S, 0, 4, 5); Char = z
+string.codepoint_offset(S, 0, 5, _) failed
+
+start counting from offset 1
+string.codepoint_offset(S, 1, 0, 1); Char = 😀
+string.codepoint_offset(S, 1, 1, 3); Char = b
+string.codepoint_offset(S, 1, 2, 4); Char = 0xd83d
+string.codepoint_offset(S, 1, 3, 5); Char = z
+string.codepoint_offset(S, 1, 4, _) failed
+
+start counting from offset 2
+string.codepoint_offset(S, 2, 0, 2); Char = 0xde00
+string.codepoint_offset(S, 2, 1, 3); Char = b
+string.codepoint_offset(S, 2, 2, 4); Char = 0xd83d
+string.codepoint_offset(S, 2, 3, 5); Char = z
+string.codepoint_offset(S, 2, 4, _) failed
+
diff --git a/tests/hard_coded/string_codepoint_offset_ilseq.m b/tests/hard_coded/string_codepoint_offset_ilseq.m
new file mode 100644
index 000000000..cecb2329c
--- /dev/null
+++ b/tests/hard_coded/string_codepoint_offset_ilseq.m
@@ -0,0 +1,72 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_codepoint_offset_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, count_code_units(S0) - 1),
+    S = "a" ++ S0 ++ "b" ++ S1 ++ "z",
+
+    io.write_string("start counting from offset 0\n", !IO),
+    test_codepoint_offset(S, 0, 0, !IO),
+    io.nl(!IO),
+
+    io.write_string("start counting from offset 1\n", !IO),
+    test_codepoint_offset(S, 1, 0, !IO),
+    io.nl(!IO),
+
+    io.write_string("start counting from offset 2\n", !IO),
+    test_codepoint_offset(S, 2, 0, !IO),
+    io.nl(!IO).
+
+:- pred test_codepoint_offset(string::in, int::in, int::in, io::di, io::uo)
+    is det.
+
+test_codepoint_offset(S, StartOffset, Count, !IO) :-
+    ( if string.codepoint_offset(S, StartOffset, Count, Offset) then
+        io.format("string.codepoint_offset(S, %d, %d, %d); ",
+            [i(StartOffset), i(Count), i(Offset)], !IO),
+        ( if string.index(S, Offset, Char) then
+            io.write_string("Char = ", !IO),
+            write_char_or_hex(Char, !IO),
+            io.nl(!IO)
+        else
+            io.write_string("string.index/3 failed\n", !IO)
+        ),
+        test_codepoint_offset(S, StartOffset, Count + 1, !IO)
+    else
+        io.format("string.codepoint_offset(S, %d, %d, _) failed\n",
+            [i(StartOffset), i(Count)], !IO)
+    ).
+
+:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
+
+write_char_or_hex(Char, !IO) :-
+    ( if Char = '\ufffd' ; char.is_surrogate(Char) then
+        io.format("%#x", [i(char.to_int(Char))], !IO)
+    else
+        io.write_char(Char, !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list