[m-rev.] for review: Define behaviour of string.count_codepoints with ill-formed sequences.

Peter Wang novalazy at gmail.com
Wed Oct 16 17:04:21 AEDT 2019


library/string.m:
    Make each code unit in an ill-formed sequence contribute one
    to the value of string.count_codepoints.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_count_codepoints_ilseq.exp:
tests/hard_coded/string_count_codepoints_ilseq.exp2:
tests/hard_coded/string_count_codepoints_ilseq.m:
    Add test case.
---
 library/string.m                              | 44 +++----------------
 tests/hard_coded/Mmakefile                    |  1 +
 .../string_count_codepoints_ilseq.exp         |  1 +
 .../string_count_codepoints_ilseq.exp2        |  1 +
 .../string_count_codepoints_ilseq.m           | 35 +++++++++++++++
 5 files changed, 43 insertions(+), 39 deletions(-)
 create mode 100644 tests/hard_coded/string_count_codepoints_ilseq.exp
 create mode 100644 tests/hard_coded/string_count_codepoints_ilseq.exp2
 create mode 100644 tests/hard_coded/string_count_codepoints_ilseq.m

diff --git a/library/string.m b/library/string.m
index 0501a3198..884363c82 100644
--- a/library/string.m
+++ b/library/string.m
@@ -421,6 +421,11 @@
 
     % Determine the number of code points in a string.
     %
+    % Each valid code point, and each code unit that is part of an ill-formed
+    % sequence, contributes one to the result.
+    % (This matches the number of steps it would take to iterate over the
+    % string using string.index_next or string.prev_index.)
+    %
     % NOTE The names of this predicate and several other predicates
     % may be changed in the future to refer to code_points, not codepoints,
     % for consistency with predicate names that talk about code_units.
@@ -2737,48 +2742,9 @@ count_code_units(Str, Length) :-
 
 %---------------------%
 
-% XXX ILSEQ Behaviour depends on target language.
-%   - c: counts lead bytes without checking trailing bytes
-%   - java: unpaired surrogates count as one code point each
-%   - csharp: counts non-surrogates and high surrogates
-%   - generic: unsafe_index_next stops at the first ill-formed sequence
-%
-% The Java behaviour seems best: count well-formed code points plus each code
-% unit in an ill-formed sequence. index_next and other iteration predicates
-% should be made consistent with however count_codepoints ends up counting.
-
 count_codepoints(String) = Count :-
     count_codepoints(String, Count).
 
-:- pragma foreign_proc("C",
-    count_codepoints(String::in, Count::out),
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
-"
-    unsigned char   b;
-    int             i;
-
-    Count = 0;
-    for (i = 0; ; i++) {
-        b = String[i];
-        if (b == '\\0') {
-            break;
-        }
-        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
-            Count++;
-        }
-    }
-").
-:- pragma foreign_proc("C#",
-    count_codepoints(String::in, Count::out),
-    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
-"
-    Count = 0;
-    foreach (char c in String) {
-        if (!System.Char.IsLowSurrogate(c)) {
-            Count++;
-        }
-    }
-").
 :- pragma foreign_proc("Java",
     count_codepoints(String::in, Count::out),
     [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index f11b93eb0..7906dcbf3 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -358,6 +358,7 @@ ORDINARY_PROGS = \
 	string_class \
 	string_code_unit \
 	string_codepoint \
+	string_count_codepoints_ilseq \
 	string_first_char \
 	string_index_ilseq \
 	string_index_next_ilseq \
diff --git a/tests/hard_coded/string_count_codepoints_ilseq.exp b/tests/hard_coded/string_count_codepoints_ilseq.exp
new file mode 100644
index 000000000..7f8f011eb
--- /dev/null
+++ b/tests/hard_coded/string_count_codepoints_ilseq.exp
@@ -0,0 +1 @@
+7
diff --git a/tests/hard_coded/string_count_codepoints_ilseq.exp2 b/tests/hard_coded/string_count_codepoints_ilseq.exp2
new file mode 100644
index 000000000..7ed6ff82d
--- /dev/null
+++ b/tests/hard_coded/string_count_codepoints_ilseq.exp2
@@ -0,0 +1 @@
+5
diff --git a/tests/hard_coded/string_count_codepoints_ilseq.m b/tests/hard_coded/string_count_codepoints_ilseq.m
new file mode 100644
index 000000000..a382a00b0
--- /dev/null
+++ b/tests/hard_coded/string_count_codepoints_ilseq.m
@@ -0,0 +1,35 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_count_codepoints_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    S0 = "😀",
+    S1 = string.between(S0, 0, 1),
+    S2 = string.between(S0, 1, count_code_units(S0)),
+    S = S0 ++ S1 ++ "," ++ S2 ++ S0,
+    string.count_codepoints(S, Num),
+    io.write_int(Num, !IO),
+    io.nl(!IO).
-- 
2.23.0



More information about the reviews mailing list