[m-rev.] for review: Define behaviour of string.to_char_list (and rev) on ill-formed sequences.
Peter Wang
novalazy at gmail.com
Wed Oct 16 17:04:20 AEDT 2019
library/string.m:
Define string.to_char_list and string.to_rev_char_list to either
replace code units in ill-formed sequences with U+FFFD or return
unpaired surrogate code points.
Use Mercury version of do_to_char_list instead of updating
the foreign language implementations.
tests/hard_coded/Mmakefile:
tests/hard_coded/string_char_list_ilseq.exp:
tests/hard_coded/string_char_list_ilseq.exp2:
tests/hard_coded/string_char_list_ilseq.m:
Add test case.
---
library/string.m | 123 ++++---------------
tests/hard_coded/Mmakefile | 1 +
tests/hard_coded/string_char_list_ilseq.exp | 5 +
tests/hard_coded/string_char_list_ilseq.exp2 | 5 +
tests/hard_coded/string_char_list_ilseq.m | 50 ++++++++
5 files changed, 88 insertions(+), 96 deletions(-)
create mode 100644 tests/hard_coded/string_char_list_ilseq.exp
create mode 100644 tests/hard_coded/string_char_list_ilseq.exp2
create mode 100644 tests/hard_coded/string_char_list_ilseq.m
diff --git a/library/string.m b/library/string.m
index 1b0d32e87..0501a3198 100644
--- a/library/string.m
+++ b/library/string.m
@@ -124,9 +124,15 @@
%
% Convert the string to a list of characters (code points).
+ %
+ % In the forward mode:
+ % If strings use UTF-8 encoding then each code unit in an ill-formed
+ % sequence is replaced by U+FFFD REPLACEMENT CHARACTER in the list.
+ % If strings use UTF-16 encoding then each unpaired surrogate code point
+ % is returned as a separate code point in the list.
+ %
% The reverse mode of the predicate throws an exception if
% the list of characters contains a null character.
- %
% NOTE: In the future we may also throw an exception if the list contains
% a surrogate code point.
%
@@ -135,10 +141,17 @@
:- mode to_char_list(in, out) is det.
:- mode to_char_list(uo, in) is det.
- % Convert the string to a list of characters (code points) in reverse order.
+ % Convert the string to a list of characters (code points) in reverse
+ % order.
+ %
+ % In the forward mode:
+ % If strings use UTF-8 encoding then each code unit in an ill-formed
+ % sequence is replaced by U+FFFD REPLACEMENT CHARACTER in the list.
+ % If strings use UTF-16 encoding then each unpaired surrogate code point
+ % is returned as a separate code point in the list.
+ %
% The reverse mode of the predicate throws an exception if
% the list of characters contains a null character.
- %
% NOTE: In the future we may also throw an exception if the list contains
% a surrogate code point.
%
@@ -1514,26 +1527,6 @@
% Conversions between strings and lists of characters.
%
-% XXX ILSEQ Behaviour on ill-formed sequences differs by target language.
-% - C: stops at first ill-formed sequence from the right
-% - C#: unpaired surrogates are replaced by U+FFFD
-% - Java: unpaired surrogates are returned as-is
-%
-% Perhaps we need versions of to_char_list/to_rev_char_list with choice of
-% action on ill-formed sequences:
-% - throw an exception
-% - ignore
-% - replace with U+FFFD
-%
-% Replacing with U+FFFD is only necessary for UTF-8. Unpaired surrogates in
-% UTF-16 can represent themselves since they are just code points.
-%
-% For users that want round-trip conversion of potentially invalid UTF-8,
-% one possibility would be to map bytes in ill-formed sequences to chars
-% in the low surrogate range U+DC80..U+DCFF, see
-% https://hyperreal.org/~est/utf-8b/releases/utf-8b-20060413043934/kuhn-utf-8b.html
-% A simpler possibility is to introduce a char_or_code_unit type.
-
to_char_list(S) = Cs :-
to_char_list(S, Cs).
@@ -1546,84 +1539,22 @@ to_char_list(Str::uo, CharList::in) :-
:- pred do_to_char_list(string::in, list(char)::out) is det.
-:- pragma foreign_proc("C",
- do_to_char_list(Str::in, CharList::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"{
- MR_Integer pos = strlen(Str);
- int c;
+do_to_char_list(Str, CharList) :-
+ do_to_char_list_loop(Str, length(Str), [], CharList).
- CharList = MR_list_empty_msg(MR_ALLOC_ID);
- for (;;) {
- c = MR_utf8_prev_get(Str, &pos);
- if (c <= 0) {
- break;
- }
- CharList = MR_char_list_cons_msg((MR_UnsignedChar) c, CharList,
- MR_ALLOC_ID);
- }
-}").
-:- pragma foreign_proc("C#",
- do_to_char_list(Str::in, CharList::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"
- list.List_1 lst = list.empty_list();
- for (int i = Str.Length - 1; i >= 0; i--) {
- int c;
- char c2 = Str[i];
- if (System.Char.IsLowSurrogate(c2)) {
- try {
- char c1 = Str[i - 1];
- c = System.Char.ConvertToUtf32(c1, c2);
- i--;
- } catch (System.ArgumentOutOfRangeException) {
- c = 0xfffd;
- } catch (System.IndexOutOfRangeException) {
- c = 0xfffd;
- }
- } else if (System.Char.IsHighSurrogate(c2)) {
- c = 0xfffd;
- } else {
- c = c2;
- }
- lst = list.cons(c, lst);
- }
- CharList = lst;
-").
-:- pragma foreign_proc("Java",
- do_to_char_list(Str::in, CharList::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"
- list.List_1<Integer> lst = list.empty_list();
- for (int i = Str.length(); i > 0; ) {
- int c = Str.codePointBefore(i);
- lst = list.cons(c, lst);
- i -= java.lang.Character.charCount(c);
- }
- CharList = lst;
-").
-:- pragma foreign_proc("Erlang",
- do_to_char_list(Str::in, CharList::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness],
-"
- CharList = unicode:characters_to_list(Str)
-").
+:- pred do_to_char_list_loop(string::in, int::in,
+ list(char)::in, list(char)::out) is det.
- % It would be worth checking if this Mercury implementation (or another
- % one) is as fast the foreign_proc implementations, the next time any
- % changes are required.
- %
-do_to_char_list(Str, CharList) :-
- foldr(list.cons, Str, [], CharList).
+do_to_char_list_loop(Str, Index0, !CharList) :-
+ ( if string.unsafe_prev_index(Str, Index0, Index1, C) then
+ !:CharList = [C | !.CharList],
+ do_to_char_list_loop(Str, Index1, !CharList)
+ else
+ true
+ ).
%---------------------%
-% XXX ILSEQ unsafe_index_next causes truncation at first ill-formed sequence.
-
to_rev_char_list(S) = Cs :-
to_rev_char_list(S, Cs).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 84e38bd34..f11b93eb0 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -354,6 +354,7 @@ ORDINARY_PROGS = \
string_append_ooi \
string_builder_test \
string_case \
+ string_char_list_ilseq \
string_class \
string_code_unit \
string_codepoint \
diff --git a/tests/hard_coded/string_char_list_ilseq.exp b/tests/hard_coded/string_char_list_ilseq.exp
new file mode 100644
index 000000000..1999b22a4
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.exp
@@ -0,0 +1,5 @@
+string.to_char_list
+[a, b, c, 😀, 0xfffd, 0xfffd, 0xfffd, x, y, z]
+
+string.to_rev_char_list
+[z, y, x, 0xfffd, 0xfffd, 0xfffd, 😀, c, b, a]
diff --git a/tests/hard_coded/string_char_list_ilseq.exp2 b/tests/hard_coded/string_char_list_ilseq.exp2
new file mode 100644
index 000000000..e45d8e4dd
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.exp2
@@ -0,0 +1,5 @@
+string.to_char_list
+[a, b, c, 😀, 0xd83d, x, y, z]
+
+string.to_rev_char_list
+[z, y, x, 0xd83d, 😀, c, b, a]
diff --git a/tests/hard_coded/string_char_list_ilseq.m b/tests/hard_coded/string_char_list_ilseq.m
new file mode 100644
index 000000000..615eea350
--- /dev/null
+++ b/tests/hard_coded/string_char_list_ilseq.m
@@ -0,0 +1,50 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_char_list_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ S0 = "😀",
+ S1 = string.between(S0, 0, count_code_units(S0) - 1),
+ S = "abc" ++ S0 ++ S1 ++ "xyz",
+
+ string.to_char_list(S, CharList),
+ io.write_string("string.to_char_list\n[", !IO),
+ io.write_list(CharList, ", ", write_char_or_hex, !IO),
+ io.write_string("]\n\n", !IO),
+
+ string.to_rev_char_list(S, RevCharList),
+ io.write_string("string.to_rev_char_list\n[", !IO),
+ io.write_list(RevCharList, ", ", write_char_or_hex, !IO),
+ io.write_string("]\n", !IO).
+
+:- pred write_char_or_hex(char::in, io::di, io::uo) is det.
+
+write_char_or_hex(Char, !IO) :-
+ ( if Char = '\ufffd' ; char.is_surrogate(Char) then
+ io.format("%#x", [i(char.to_int(Char))], !IO)
+ else
+ io.write_char(Char, !IO)
+ ).
--
2.23.0
More information about the reviews
mailing list