[m-rev.] for review: Tighten up string.from_code_unit_list et al.

Peter Wang novalazy at gmail.com
Mon Nov 4 16:51:38 AEDT 2019


library/string.m:
    Document that from_code_unit_list fails if the result string would
    contain a null character, and enforce that in the Java and C#
    implementations. It was already enforced in the C implementation.

    Make from_code_unit_list fail if the code unit list contains an
    invalid value (negative or >0xff or >0xffff).

    Document that from_utf{8,16}_code_unit_list fail if the result
    string would contain a null character.

    Make from_utf8_code_unit_list call semidet_from_rev_char_list rather
    than from_rev_char_list so that it fails as documented instead of
    throwing an exception if the code unit list correctly encodes a list
    of code points, but the code points cannot be encoded into a string.

    Similarly for from_utf16_code_unit_list.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_from_code_unit_list.exp:
tests/hard_coded/string_from_code_unit_list.exp2:
tests/hard_coded/string_from_code_unit_list.m:
    Add test case.
---
 library/string.m                              | 43 ++++++++----
 tests/hard_coded/Mmakefile                    |  1 +
 .../hard_coded/string_from_code_unit_list.exp |  4 ++
 .../string_from_code_unit_list.exp2           |  4 ++
 tests/hard_coded/string_from_code_unit_list.m | 67 +++++++++++++++++++
 5 files changed, 105 insertions(+), 14 deletions(-)
 create mode 100644 tests/hard_coded/string_from_code_unit_list.exp
 create mode 100644 tests/hard_coded/string_from_code_unit_list.exp2
 create mode 100644 tests/hard_coded/string_from_code_unit_list.m

diff --git a/library/string.m b/library/string.m
index b0930a767..10428a0b4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -218,18 +218,21 @@
 :- pred to_utf16_code_unit_list(string::in, list(int)::out) is det.
 
     % Convert a list of code units to a string.
-    % Fails if the list does not contain a valid encoding of a string,
-    % in the encoding expected by the current process.
+    % Fails if the list does not contain a valid encoding of a string
+    % (in the encoding expected by the current process),
+    % or if the string would contain a null character.
     %
 :- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
 
     % Convert a list of UTF-8 code units to a string.
-    % Fails if the list does not contain a valid encoding of a string.
+    % Fails if the list does not contain a valid encoding of a string
+    % or if the string would contain a null character.
     %
 :- pred from_utf8_code_unit_list(list(int)::in, string::uo) is semidet.
 
-    % Convert a list of UTF-8 code units to a string.
-    % Fails if the list does not contain a valid encoding of a string.
+    % Convert a list of UTF-16 code units to a string.
+    % Fails if the list does not contain a valid encoding of a string
+    % or if the string would contain a null character.
     %
 :- pred from_utf16_code_unit_list(list(int)::in, string::uo) is semidet.
 
@@ -1690,6 +1693,10 @@ do_to_rev_char_list_loop(Str, Index0, !RevCharList) :-
     ).
 
 %---------------------%
+%
+% XXX There is an inconsistency in that from_char_list/from_rev_char_list
+% throw exceptions unlike from_code_unit_list/from_{utf8,utf16}_code_unit_list
+% which fail when the list of code points cannot be encoded in a string.
 
 from_char_list(Cs) = S :-
     from_char_list(Cs, S).
@@ -2027,11 +2034,9 @@ encode_utf16(Char, CodeList0, CodeList) :-
     size = 0;
     list_ptr = CodeList;
     while (! MR_list_is_empty(list_ptr)) {
-        int c;
-        c = MR_list_head(list_ptr);
-        // It is an error to put a null character in a string
-        // (see the comments at the top of this file).
-        if (c == '\\0' || c > 0xff) {
+        unsigned c = (unsigned) MR_list_head(list_ptr);
+        // Check for null character or invalid code unit.
+        if (c == 0 || c > 0xff) {
             SUCCESS_INDICATOR = MR_FALSE;
             break;
         }
@@ -2056,6 +2061,11 @@ encode_utf16(Char, CodeList0, CodeList) :-
 
     Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
     for (int i : iterable) {
+        // Check for null character or invalid code unit.
+        if (i <= 0 || i > 0xffff) {
+            SUCCESS_INDICATOR = false;
+            break;
+        }
         char c = (char) i;
         if (prev_high) {
             if (!java.lang.Character.isLowSurrogate(c)) {
@@ -2091,8 +2101,13 @@ encode_utf16(Char, CodeList0, CodeList) :-
     SUCCESS_INDICATOR = true;
 
     while (!list.is_empty(CodeList)) {
-        // Both casts are required.
-        char c = (char) (int) list.det_head(CodeList);
+        int i = (int) list.det_head(CodeList);
+        // Check for null character or invalid code unit.
+        if (i <= 0 || i > 0xffff) {
+            SUCCESS_INDICATOR = false;
+            break;
+        }
+        char c = (char) i;
         if (prev_high) {
             if (!System.Char.IsLowSurrogate(c)) {
                 SUCCESS_INDICATOR = false;
@@ -2134,7 +2149,7 @@ from_utf8_code_unit_list(CodeList, String) :-
         from_code_unit_list(CodeList, String)
     else
         decode_utf8(CodeList, [], RevChars),
-        from_rev_char_list(RevChars, String)
+        semidet_from_rev_char_list(RevChars, String)
     ).
 
 :- pred decode_utf8(list(int)::in, list(char)::in, list(char)::out) is semidet.
@@ -2188,7 +2203,7 @@ utf8_is_trail_byte(C) :-
 from_utf16_code_unit_list(CodeList, String) :-
     ( if internal_encoding_is_utf8 then
         decode_utf16(CodeList, [], RevChars),
-        from_rev_char_list(RevChars, String)
+        semidet_from_rev_char_list(RevChars, String)
     else
         from_code_unit_list(CodeList, String)
     ).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 826ea7d0d..46b05b836 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -368,6 +368,7 @@ ORDINARY_PROGS = \
 	string_first_char_ilseq \
 	string_fold_ilseq \
 	string_from_char_list_ilseq \
+	string_from_code_unit_list \
 	string_index_ilseq \
 	string_index_next_ilseq \
 	string_loop \
diff --git a/tests/hard_coded/string_from_code_unit_list.exp b/tests/hard_coded/string_from_code_unit_list.exp
new file mode 100644
index 000000000..32e5b9f99
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.exp
@@ -0,0 +1,4 @@
+from_code_unit_list([]) = ""
+from_code_unit_list([65, 0, 66]) failed
+from_code_unit_list([240, 159, 152, 128]) = "😀"
+from_code_unit_list([-16, 159, 152, 128]) failed
diff --git a/tests/hard_coded/string_from_code_unit_list.exp2 b/tests/hard_coded/string_from_code_unit_list.exp2
new file mode 100644
index 000000000..a168ecc72
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.exp2
@@ -0,0 +1,4 @@
+from_code_unit_list([]) = ""
+from_code_unit_list([65, 0, 66]) failed
+from_code_unit_list([55357, 56832]) = "😀"
+from_code_unit_list([-10179, 56832]) failed
diff --git a/tests/hard_coded/string_from_code_unit_list.m b/tests/hard_coded/string_from_code_unit_list.m
new file mode 100644
index 000000000..5f2fd2ca2
--- /dev/null
+++ b/tests/hard_coded/string_from_code_unit_list.m
@@ -0,0 +1,67 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_from_code_unit_list.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    ( if length("😀") = 4 then
+        Cases = test_cases_8
+    else
+        Cases = test_cases_16
+    ),
+    list.foldl(test_from_code_unit_list, Cases, !IO).
+
+:- func test_cases_8 = list(list(int)).
+
+test_cases_8 = [
+    [],
+    [65, 0, 66],    % null char
+    [0xf0, 0x9f, 0x98, 0x80],
+    % Sign extend first byte: the old implementation used to silently ignore
+    % higher bits so this case would succeed.
+    [\0xff \/ 0xf0, 0x9f, 0x98, 0x80]
+].
+
+:- func test_cases_16 = list(list(int)).
+
+test_cases_16 = [
+    [],
+    [65, 0, 66],
+    [0xd83d, 0xde00],
+    [\0xffff \/ 0xd83d, 0xde00]
+].
+
+:- pred test_from_code_unit_list(list(int)::in, io::di, io::uo) is det.
+
+test_from_code_unit_list(CodeList, !IO) :-
+    io.write_string("from_code_unit_list(", !IO),
+    io.write(CodeList, !IO),
+    io.write_string(")", !IO),
+    ( if string.from_code_unit_list(CodeList, String) then
+        io.write_string(" = """, !IO),
+        io.write_string(String, !IO),
+        io.write_string("""\n", !IO)
+    else
+        io.write_string(" failed\n", !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list