[m-rev.] for review: Check for surrogates when converting list of char to string.
Peter Wang
novalazy at gmail.com
Tue Oct 29 12:01:46 AEDT 2019
library/string.m:
Make from_char_list, from_rev_char_list, to_char_list throw an
exception if the list of chars includes a surrogate code point that
cannot be encoded in a UTF-8 string.
Make semidet_from_char_list, semidet_from_rev_char_list,
to_char_list fail if the list of chars includes a surrogate code
point that cannot be encoded in a UTF-8 string.
runtime/mercury_string.h:
Document return value of MR_utf8_width.
tests/hard_coded/Mmakefile:
tests/hard_coded/string_from_char_list_ilseq.exp:
tests/hard_coded/string_from_char_list_ilseq.exp2:
tests/hard_coded/string_from_char_list_ilseq.m:
Add test case.
tests/hard_coded/null_char.exp:
Expect new message in exceptions thrown by from_char_list,
from_rev_char_list.
tests/hard_coded/string_hash.m:
Don't generate surrogate code points in random strings.
---
library/string.m | 162 +++++++++---------
runtime/mercury_string.h | 1 +
tests/hard_coded/Mmakefile | 1 +
tests/hard_coded/null_char.exp | 4 +-
.../string_from_char_list_ilseq.exp | 15 ++
.../string_from_char_list_ilseq.exp2 | 15 ++
.../hard_coded/string_from_char_list_ilseq.m | 88 ++++++++++
tests/hard_coded/string_hash.m | 7 +-
8 files changed, 211 insertions(+), 82 deletions(-)
create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp
create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp2
create mode 100644 tests/hard_coded/string_from_char_list_ilseq.m
diff --git a/library/string.m b/library/string.m
index ab19974d2..cbdbcd128 100644
--- a/library/string.m
+++ b/library/string.m
@@ -131,10 +131,10 @@
% If strings use UTF-16 encoding then each unpaired surrogate code point
% is returned as a separate code point in the list.
%
- % The reverse mode of the predicate throws an exception if
- % the list of characters contains a null character.
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % The reverse mode of the predicate throws an exception if the list
+ % contains a null character or code point that cannot be encoded in a
+ % string (namely, surrogate code points cannot be encoded in UTF-8
+ % strings).
%
% The reverse mode of to_char_list/2 is deprecated because the implied
% ability to round trip convert a string to a list then back to the same
@@ -155,10 +155,10 @@
% If strings use UTF-16 encoding then each unpaired surrogate code point
% is returned as a separate code point in the list.
%
- % The reverse mode of the predicate throws an exception if
- % the list of characters contains a null character.
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % The reverse mode of the predicate throws an exception if the list
+ % contains a null character or code point that cannot be encoded in a
+ % string (namely, surrogate code points cannot be encoded in UTF-8
+ % strings).
%
% The reverse mode of to_rev_char_list/2 is deprecated because the implied
% ability to round trip convert a string to a list then back to the same
@@ -171,10 +171,9 @@
:- mode to_rev_char_list(uo, in) is det.
% Convert a list of characters (code points) to a string.
- % Throws an exception if the list of characters contains a null character.
- %
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % Throws an exception if the list contains a null character or code point
+ % that cannot be encoded in a string (namely, surrogate code points cannot
+ % be encoded in UTF-8 strings).
%
% The forward mode of from_char_list/2 is deprecated because the implied
% ability to round trip convert a string to a list then back to the same
@@ -187,28 +186,21 @@
:- mode from_char_list(out, in) is det.
% As above, but fail instead of throwing an exception if the list contains
- % a null character.
- %
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % a null character or code point that cannot be encoded in a string.
%
:- pred semidet_from_char_list(list(char)::in, string::uo) is semidet.
% Same as from_char_list, except that it reverses the order
% of the characters.
- % Throws an exception if the list of characters contains a null character.
- %
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % Throws an exception if the list contains a null character or code point
+ % that cannot be encoded in a string (namely, surrogate code points cannot
+ % be encoded in UTF-8 strings).
%
:- func from_rev_char_list(list(char)::in) = (string::uo) is det.
:- pred from_rev_char_list(list(char)::in, string::uo) is det.
% As above, but fail instead of throwing an exception if the list contains
- % a null character.
- %
- % NOTE: In the future we may also throw an exception if the list contains
- % a surrogate code point.
+ % a null character or code point that cannot be encoded in a string.
%
:- pred semidet_from_rev_char_list(list(char)::in, string::uo) is semidet.
@@ -1696,7 +1688,7 @@ from_char_list(Chars::in, Str::uo) :-
( if semidet_from_char_list(Chars, Str0) then
Str = Str0
else
- unexpected($pred, "null character in list")
+ unexpected($pred, "null character or surrogate code point in list")
).
:- pragma foreign_proc("C",
@@ -1704,51 +1696,55 @@ from_char_list(Chars::in, Str::uo) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, may_not_duplicate, no_sharing],
"{
- // mode (uo, in) is det
MR_Word char_list_ptr;
size_t size;
- MR_Char c;
// Loop to calculate list length + sizeof(MR_Word) in `size'
// using list in `char_list_ptr'.
- size = 0;
- char_list_ptr = CharList;
- while (! MR_list_is_empty(char_list_ptr)) {
- c = (MR_Char) MR_list_head(char_list_ptr);
- if (MR_is_ascii(c)) {
- size++;
- } else {
- // XXX ILSEQ Do something if c is a surrogate code point.
- size += MR_utf8_width(c);
- }
- char_list_ptr = MR_list_tail(char_list_ptr);
- }
-
- // Allocate heap space for string.
- MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
-
- // Loop to copy the characters from the char_list to the string.
SUCCESS_INDICATOR = MR_TRUE;
size = 0;
char_list_ptr = CharList;
while (! MR_list_is_empty(char_list_ptr)) {
- c = (MR_Char) MR_list_head(char_list_ptr);
- // It is an error to put a null character in a string
- // (see the comments at the top of this file).
+ MR_Char c = (MR_Char) MR_list_head(char_list_ptr);
if (c == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
break;
}
if (MR_is_ascii(c)) {
- Str[size] = c;
size++;
} else {
- size += MR_utf8_encode(Str + size, c);
+ size_t csize = MR_utf8_width(c);
+ if (csize == 0) {
+ // c is a surrogate code point (or even out of range,
+ // but that is not supposed to happen).
+ SUCCESS_INDICATOR = MR_FALSE;
+ break;
+ }
+ size += csize;
}
char_list_ptr = MR_list_tail(char_list_ptr);
}
- Str[size] = '\\0';
+ if (SUCCESS_INDICATOR) {
+ // Allocate heap space for string.
+ MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
+
+ // Loop to copy the characters from the char_list to the string.
+ size = 0;
+ char_list_ptr = CharList;
+ while (! MR_list_is_empty(char_list_ptr)) {
+ MR_Char c = (MR_Char) MR_list_head(char_list_ptr);
+ if (MR_is_ascii(c)) {
+ Str[size] = c;
+ size++;
+ } else {
+ size += MR_utf8_encode(Str + size, c);
+ }
+ char_list_ptr = MR_list_tail(char_list_ptr);
+ }
+
+ Str[size] = '\\0';
+ }
}").
:- pragma foreign_proc("C#",
semidet_from_char_list(CharList::in, Str::uo),
@@ -1808,6 +1804,7 @@ semidet_from_char_list(CharList, Str) :-
;
CharList = [C | Cs],
not char.to_int(C, 0),
+ internal_encoding_is_utf8 => not char.is_surrogate(C),
semidet_from_char_list(Cs, Str0),
first_char(Str, C, Str0)
).
@@ -1827,7 +1824,7 @@ from_rev_char_list(Chars, Str) :-
( if semidet_from_rev_char_list(Chars, Str0) then
Str = Str0
else
- unexpected($pred, "null character in list")
+ unexpected($pred, "null character or surrogate code point in list")
).
:- pragma foreign_proc("C",
@@ -1837,48 +1834,55 @@ from_rev_char_list(Chars, Str) :-
"{
MR_Word list_ptr;
MR_Word size;
- MR_Char c;
// Loop to calculate list length in `size' using list in `list_ptr'.
+ SUCCESS_INDICATOR = MR_TRUE;
size = 0;
list_ptr = Chars;
while (!MR_list_is_empty(list_ptr)) {
- c = (MR_Char) MR_list_head(list_ptr);
- if (MR_is_ascii(c)) {
- size++;
- } else {
- // XXX ILSEQ Do something if c is a surrogate code point.
- size += MR_utf8_width(c);
- }
- list_ptr = MR_list_tail(list_ptr);
- }
-
- // Allocate heap space for string.
- MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
-
- // Set size to be the offset of the end of the string
- // (ie the \\0) and null terminate the string.
- Str[size] = '\\0';
-
- // Loop to copy the characters from the list_ptr to the string
- // in reverse order.
- list_ptr = Chars;
- SUCCESS_INDICATOR = MR_TRUE;
- while (!MR_list_is_empty(list_ptr)) {
- c = (MR_Char) MR_list_head(list_ptr);
+ MR_Char c = (MR_Char) MR_list_head(list_ptr);
if (c == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
break;
}
if (MR_is_ascii(c)) {
- size--;
- Str[size] = c;
+ size++;
} else {
- size -= MR_utf8_width(c);
- MR_utf8_encode(Str + size, c);
+ size_t csize = MR_utf8_width(c);
+ if (csize == 0) {
+ // c is a surrogate code point (or even out of range,
+ // but that is not supposed to happen).
+ SUCCESS_INDICATOR = MR_FALSE;
+ break;
+ }
+ size += csize;
}
list_ptr = MR_list_tail(list_ptr);
}
+
+ if (SUCCESS_INDICATOR) {
+ // Allocate heap space for string.
+ MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
+
+ // Set size to be the offset of the end of the string
+ // (ie the \\0) and null terminate the string.
+ Str[size] = '\\0';
+
+ // Loop to copy the characters from the list_ptr to the string
+ // in reverse order.
+ list_ptr = Chars;
+ while (! MR_list_is_empty(list_ptr)) {
+ MR_Char c = (MR_Char) MR_list_head(list_ptr);
+ if (MR_is_ascii(c)) {
+ size--;
+ Str[size] = c;
+ } else {
+ size -= MR_utf8_width(c);
+ MR_utf8_encode(Str + size, c);
+ }
+ list_ptr = MR_list_tail(list_ptr);
+ }
+ }
}").
:- pragma foreign_proc("C#",
semidet_from_rev_char_list(Chars::in, Str::uo),
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index d3cb00262..a4dcf7c3c 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -485,6 +485,7 @@ extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);
// Return the number of bytes required to encode the code point `c' in UTF-8.
+// Returns 0 for surrogate code points or illegal values.
extern size_t MR_utf8_width(MR_Char c);
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index ee9f51179..47e8ec5a8 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -365,6 +365,7 @@ ORDINARY_PROGS = \
string_count_codepoints_ilseq \
string_first_char \
string_fold_ilseq \
+ string_from_char_list_ilseq \
string_index_ilseq \
string_index_next_ilseq \
string_loop \
diff --git a/tests/hard_coded/null_char.exp b/tests/hard_coded/null_char.exp
index 7e03f6907..842f8aa9e 100644
--- a/tests/hard_coded/null_char.exp
+++ b/tests/hard_coded/null_char.exp
@@ -1,6 +1,6 @@
All tests should result in exceptions being thrown.
-exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character in list")))
-exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character in list")))
+exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")))
+exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character or surrogate code point in list")))
exception(univ_cons(software_error("predicate `string.set_char\'/4: Unexpected: null character")))
exception(univ_cons(software_error("predicate `string.unsafe_set_char\'/4: Unexpected: null character")))
error("", io_error("null character in input"))
diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp b/tests/hard_coded/string_from_char_list_ilseq.exp
new file mode 100644
index 000000000..73f24d867
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.exp
@@ -0,0 +1,15 @@
+semidet_from_char_list:
+a b
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+
+semidet_from_rev_char_list:
+b a
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp2 b/tests/hard_coded/string_from_char_list_ilseq.exp2
new file mode 100644
index 000000000..c77d3d069
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.exp2
@@ -0,0 +1,15 @@
+semidet_from_char_list:
+a b
+semidet_from_char_list failed
+a 0xd83d b
+a 0xde00 b
+a 😀 b
+a 0xde00 0xd83d b
+
+semidet_from_rev_char_list:
+b a
+semidet_from_rev_char_list failed
+b 0xd83d a
+b 0xde00 a
+b 0xde00 0xd83d a
+b 😀 a
diff --git a/tests/hard_coded/string_from_char_list_ilseq.m b/tests/hard_coded/string_from_char_list_ilseq.m
new file mode 100644
index 000000000..740f256b3
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.m
@@ -0,0 +1,88 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_from_char_list_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ io.write_string("semidet_from_char_list:\n", !IO),
+ list.foldl(test_from_char_list, test_cases, !IO),
+
+ io.write_string("\nsemidet_from_rev_char_list:\n", !IO),
+ list.foldl(test_from_rev_char_list, test_cases, !IO).
+
+:- func test_cases = list(list(char)).
+
+test_cases = [
+ ['a', 'b'],
+ ['a', char.det_from_int(0), 'b'],
+ ['a', char.det_from_int(0xd83d), 'b'],
+ ['a', char.det_from_int(0xde00), 'b'],
+ ['a', char.det_from_int(0xd83d), char.det_from_int(0xde00), 'b'],
+ ['a', char.det_from_int(0xde00), char.det_from_int(0xd83d), 'b']
+].
+
+:- pred test_from_char_list(list(char)::in, io::di, io::uo) is det.
+
+test_from_char_list(Chars, !IO) :-
+ ( if semidet_from_char_list(Chars, Str) then
+ write_string_debug(Str, !IO),
+ io.nl(!IO)
+ else
+ io.write_string("semidet_from_char_list failed\n", !IO)
+ ).
+
+:- pred test_from_rev_char_list(list(char)::in, io::di, io::uo) is det.
+
+test_from_rev_char_list(Chars, !IO) :-
+ ( if semidet_from_rev_char_list(Chars, Str) then
+ write_string_debug(Str, !IO),
+ io.nl(!IO)
+ else
+ io.write_string("semidet_from_rev_char_list failed\n", !IO)
+ ).
+
+:- pred write_string_debug(string::in, io::di, io::uo) is det.
+
+write_string_debug(S, !IO) :-
+ write_string_debug_loop(S, 0, !IO).
+
+:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.
+
+write_string_debug_loop(S, Index, !IO) :-
+ ( if string.index_next(S, Index, NextIndex, Char) then
+ ( if is_surrogate(Char) then
+ write_hex(char.to_int(Char), !IO)
+ else
+ io.write_char(Char, !IO)
+ ),
+ io.write_char(' ', !IO),
+ write_string_debug_loop(S, NextIndex, !IO)
+ else
+ true
+ ).
+
+:- pred write_hex(int::in, io::di, io::uo) is det.
+
+write_hex(I, !IO) :-
+ io.format("%#x", [i(I)], !IO).
diff --git a/tests/hard_coded/string_hash.m b/tests/hard_coded/string_hash.m
index 1e8afe68b..5d534935b 100644
--- a/tests/hard_coded/string_hash.m
+++ b/tests/hard_coded/string_hash.m
@@ -93,7 +93,12 @@ rand_char(Char, !RS) :-
random.random(Rand, !RS),
% U+0001..U+10ffff (avoid null character).
Int = 1 + (Rand `mod` char.max_char_value),
- char.det_from_int(Int, Char).
+ char.det_from_int(Int, Char0),
+ ( if is_surrogate(Char0) then
+ rand_char(Char, !RS)
+ else
+ Char = Char0
+ ).
:- pred test_hash(string::in, int::in, int::in, string::in,
bool::in, bool::out, io::di, io::uo) is det.
--
2.23.0
More information about the reviews
mailing list