[m-rev.] for review: Tighten up string.from_code_unit_list et al.
Peter Wang
novalazy at gmail.com
Mon Nov 4 16:51:38 AEDT 2019
library/string.m:
Document that from_code_unit_list fails if the result string would
contain a null character, and enforce that in the Java and C#
implementations. It was already enforced in the C implementation.
Make from_code_unit_list fail if the code unit list contains an
invalid value (negative or >0xff or >0xffff).
Document that from_utf{8,16}_code_unit_list fail if the result
string would contain a null character.
Make from_utf8_code_unit_list call semidet_from_rev_char_list rather
than from_rev_char_list so that it fails as documented instead of
throwing an exception if the code unit list correctly encodes a list
of code points, but the code points cannot be encoded into a string.
Similarly for from_utf16_code_unit_list.
tests/hard_coded/Mmakefile:
tests/hard_coded/string_from_code_unit_list.exp:
tests/hard_coded/string_from_code_unit_list.exp2:
tests/hard_coded/string_from_code_unit_list.m:
Add test case.
---
library/string.m | 43 ++++++++----
tests/hard_coded/Mmakefile | 1 +
.../hard_coded/string_from_code_unit_list.exp | 4 ++
.../string_from_code_unit_list.exp2 | 4 ++
tests/hard_coded/string_from_code_unit_list.m | 67 +++++++++++++++++++
5 files changed, 105 insertions(+), 14 deletions(-)
create mode 100644 tests/hard_coded/string_from_code_unit_list.exp
create mode 100644 tests/hard_coded/string_from_code_unit_list.exp2
create mode 100644 tests/hard_coded/string_from_code_unit_list.m
diff --git a/library/string.m b/library/string.m
index b0930a767..10428a0b4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -218,18 +218,21 @@
:- pred to_utf16_code_unit_list(string::in, list(int)::out) is det.
% Convert a list of code units to a string.
- % Fails if the list does not contain a valid encoding of a string,
- % in the encoding expected by the current process.
+ % Fails if the list does not contain a valid encoding of a string
+ % (in the encoding expected by the current process),
+ % or if the string would contain a null character.
%
:- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
% Convert a list of UTF-8 code units to a string.
- % Fails if the list does not contain a valid encoding of a string.
+ % Fails if the list does not contain a valid encoding of a string
+ % or if the string would contain a null character.
%
:- pred from_utf8_code_unit_list(list(int)::in, string::uo) is semidet.
- % Convert a list of UTF-8 code units to a string.
- % Fails if the list does not contain a valid encoding of a string.
+ % Convert a list of UTF-16 code units to a string.
+ % Fails if the list does not contain a valid encoding of a string
+ % or if the string would contain a null character.
%
:- pred from_utf16_code_unit_list(list(int)::in, string::uo) is semidet.
@@ -1690,6 +1693,10 @@ do_to_rev_char_list_loop(Str, Index0, !RevCharList) :-
).
%---------------------%
+%
+% XXX There is an inconsistency in that from_char_list/from_rev_char_list
+% throw exceptions unlike from_code_unit_list/from_{utf8,utf16}_code_unit_list
+% which fail when the list of code points cannot be encoded in a string.
from_char_list(Cs) = S :-
from_char_list(Cs, S).
@@ -2027,11 +2034,9 @@ encode_utf16(Char, CodeList0, CodeList) :-
size = 0;
list_ptr = CodeList;
while (! MR_list_is_empty(list_ptr)) {
- int c;
- c = MR_list_head(list_ptr);
- // It is an error to put a null character in a string
- // (see the comments at the top of this file).
- if (c == '\\0' || c > 0xff) {
+ unsigned c = (unsigned) MR_list_head(list_ptr);
+ // Check for null character or invalid code unit.
+ if (c == 0 || c > 0xff) {
SUCCESS_INDICATOR = MR_FALSE;
break;
}
@@ -2056,6 +2061,11 @@ encode_utf16(Char, CodeList0, CodeList) :-
Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
for (int i : iterable) {
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
char c = (char) i;
if (prev_high) {
if (!java.lang.Character.isLowSurrogate(c)) {
@@ -2091,8 +2101,13 @@ encode_utf16(Char, CodeList0, CodeList) :-
SUCCESS_INDICATOR = true;
while (!list.is_empty(CodeList)) {
- // Both casts are required.
- char c = (char) (int) list.det_head(CodeList);
+ int i = (int) list.det_head(CodeList);
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ char c = (char) i;
if (prev_high) {
if (!System.Char.IsLowSurrogate(c)) {
SUCCESS_INDICATOR = false;
@@ -2134,7 +2149,7 @@ from_utf8_code_unit_list(CodeList, String) :-
from_code_unit_list(CodeList, String)
else
decode_utf8(CodeList, [], RevChars),
- from_rev_char_list(RevChars, String)
+ semidet_from_rev_char_list(RevChars, String)
).
:- pred decode_utf8(list(int)::in, list(char)::in, list(char)::out) is semidet.
@@ -2188,7 +2203,7 @@ utf8_is_trail_byte(C) :-
from_utf16_code_unit_list(CodeList, String) :-
( if internal_encoding_is_utf8 then
decode_utf16(CodeList, [], RevChars),
- from_rev_char_list(RevChars, String)
+ semidet_from_rev_char_list(RevChars, String)
else
from_code_unit_list(CodeList, String)
).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 826ea7d0d..46b05b836 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -368,6 +368,7 @@ ORDINARY_PROGS = \
string_first_char_ilseq \
string_fold_ilseq \
string_from_char_list_ilseq \
+ string_from_code_unit_list \
string_index_ilseq \
string_index_next_ilseq \
string_loop \
diff --git a/tests/hard_coded/string_from_code_unit_list.exp b/tests/hard_coded/string_from_code_unit_list.exp
new file mode 100644
index 000000000..32e5b9f99
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.exp
@@ -0,0 +1,4 @@
+from_code_unit_list([]) = ""
+from_code_unit_list([65, 0, 66]) failed
+from_code_unit_list([240, 159, 152, 128]) = "😀"
+from_code_unit_list([-16, 159, 152, 128]) failed
diff --git a/tests/hard_coded/string_from_code_unit_list.exp2 b/tests/hard_coded/string_from_code_unit_list.exp2
new file mode 100644
index 000000000..a168ecc72
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.exp2
@@ -0,0 +1,4 @@
+from_code_unit_list([]) = ""
+from_code_unit_list([65, 0, 66]) failed
+from_code_unit_list([55357, 56832]) = "😀"
+from_code_unit_list([-10179, 56832]) failed
diff --git a/tests/hard_coded/string_from_code_unit_list.m b/tests/hard_coded/string_from_code_unit_list.m
new file mode 100644
index 000000000..5f2fd2ca2
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.m
@@ -0,0 +1,67 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_from_code_unit_list.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ ( if length("😀") = 4 then
+ Cases = test_cases_8
+ else
+ Cases = test_cases_16
+ ),
+ list.foldl(test_from_code_unit_list, Cases, !IO).
+
+:- func test_cases_8 = list(list(int)).
+
+test_cases_8 = [
+ [],
+ [65, 0, 66], % null char
+ [0xf0, 0x9f, 0x98, 0x80],
+ % Sign extend first byte: the old implementation used to silently ignore
+ % higher bits so this case would succeed.
+ [\0xff \/ 0xf0, 0x9f, 0x98, 0x80]
+].
+
+:- func test_cases_16 = list(list(int)).
+
+test_cases_16 = [
+ [],
+ [65, 0, 66],
+ [0xd83d, 0xde00],
+ [\0xffff \/ 0xd83d, 0xde00]
+].
+
+:- pred test_from_code_unit_list(list(int)::in, io::di, io::uo) is det.
+
+test_from_code_unit_list(CodeList, !IO) :-
+ io.write_string("from_code_unit_list(", !IO),
+ io.write(CodeList, !IO),
+ io.write_string(")", !IO),
+ ( if string.from_code_unit_list(CodeList, String) then
+ io.write_string(" = """, !IO),
+ io.write_string(String, !IO),
+ io.write_string("""\n", !IO)
+ else
+ io.write_string(" failed\n", !IO)
+ ).
--
2.23.0
More information about the reviews
mailing list