[m-rev.] for review: Check for surrogates when converting list of char to string.

Peter Wang novalazy at gmail.com
Tue Oct 29 12:01:46 AEDT 2019


library/string.m:
    Make from_char_list, from_rev_char_list, to_char_list throw an
    exception if the list of chars includes a surrogate code point that
    cannot be encoded in a UTF-8 string.

    Make semidet_from_char_list, semidet_from_rev_char_list,
    to_char_list fail if the list of chars includes a surrogate code
    point that cannot be encoded in a UTF-8 string.

runtime/mercury_string.h:
    Document return value of MR_utf8_width.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_from_char_list_ilseq.exp:
tests/hard_coded/string_from_char_list_ilseq.exp2:
tests/hard_coded/string_from_char_list_ilseq.m:
    Add test case.

tests/hard_coded/null_char.exp:
    Expect new message in exceptions thrown by from_char_list,
    from_rev_char_list.

tests/hard_coded/string_hash.m:
    Don't generate surrogate code points in random strings.
---
 library/string.m                              | 162 +++++++++---------
 runtime/mercury_string.h                      |   1 +
 tests/hard_coded/Mmakefile                    |   1 +
 tests/hard_coded/null_char.exp                |   4 +-
 .../string_from_char_list_ilseq.exp           |  15 ++
 .../string_from_char_list_ilseq.exp2          |  15 ++
 .../hard_coded/string_from_char_list_ilseq.m  |  88 ++++++++++
 tests/hard_coded/string_hash.m                |   7 +-
 8 files changed, 211 insertions(+), 82 deletions(-)
 create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp
 create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp2
 create mode 100644 tests/hard_coded/string_from_char_list_ilseq.m

diff --git a/library/string.m b/library/string.m
index ab19974d2..cbdbcd128 100644
--- a/library/string.m
+++ b/library/string.m
@@ -131,10 +131,10 @@
     % If strings use UTF-16 encoding then each unpaired surrogate code point
     % is returned as a separate code point in the list.
     %
-    % The reverse mode of the predicate throws an exception if
-    % the list of characters contains a null character.
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % The reverse mode of the predicate throws an exception if the list
+    % contains a null character or code point that cannot be encoded in a
+    % string (namely, surrogate code points cannot be encoded in UTF-8
+    % strings).
     %
     % The reverse mode of to_char_list/2 is deprecated because the implied
     % ability to round trip convert a string to a list then back to the same
@@ -155,10 +155,10 @@
     % If strings use UTF-16 encoding then each unpaired surrogate code point
     % is returned as a separate code point in the list.
     %
-    % The reverse mode of the predicate throws an exception if
-    % the list of characters contains a null character.
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % The reverse mode of the predicate throws an exception if the list
+    % contains a null character or code point that cannot be encoded in a
+    % string (namely, surrogate code points cannot be encoded in UTF-8
+    % strings).
     %
     % The reverse mode of to_rev_char_list/2 is deprecated because the implied
     % ability to round trip convert a string to a list then back to the same
@@ -171,10 +171,9 @@
 :- mode to_rev_char_list(uo, in) is det.
 
     % Convert a list of characters (code points) to a string.
-    % Throws an exception if the list of characters contains a null character.
-    %
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % Throws an exception if the list contains a null character or code point
+    % that cannot be encoded in a string (namely, surrogate code points cannot
+    % be encoded in UTF-8 strings).
     %
     % The forward mode of from_char_list/2 is deprecated because the implied
     % ability to round trip convert a string to a list then back to the same
@@ -187,28 +186,21 @@
 :- mode from_char_list(out, in) is det.
 
     % As above, but fail instead of throwing an exception if the list contains
-    % a null character.
-    %
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % a null character or code point that cannot be encoded in a string.
     %
 :- pred semidet_from_char_list(list(char)::in, string::uo) is semidet.
 
     % Same as from_char_list, except that it reverses the order
     % of the characters.
-    % Throws an exception if the list of characters contains a null character.
-    %
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % Throws an exception if the list contains a null character or code point
+    % that cannot be encoded in a string (namely, surrogate code points cannot
+    % be encoded in UTF-8 strings).
     %
 :- func from_rev_char_list(list(char)::in) = (string::uo) is det.
 :- pred from_rev_char_list(list(char)::in, string::uo) is det.
 
     % As above, but fail instead of throwing an exception if the list contains
-    % a null character.
-    %
-    % NOTE: In the future we may also throw an exception if the list contains
-    % a surrogate code point.
+    % a null character or code point that cannot be encoded in a string.
     %
 :- pred semidet_from_rev_char_list(list(char)::in, string::uo) is semidet.
 
@@ -1696,7 +1688,7 @@ from_char_list(Chars::in, Str::uo) :-
     ( if semidet_from_char_list(Chars, Str0) then
         Str = Str0
     else
-        unexpected($pred, "null character in list")
+        unexpected($pred, "null character or surrogate code point in list")
     ).
 
 :- pragma foreign_proc("C",
@@ -1704,51 +1696,55 @@ from_char_list(Chars::in, Str::uo) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, may_not_duplicate, no_sharing],
 "{
-    // mode (uo, in) is det
     MR_Word char_list_ptr;
     size_t size;
-    MR_Char c;
 
     // Loop to calculate list length + sizeof(MR_Word) in `size'
     // using list in `char_list_ptr'.
-    size = 0;
-    char_list_ptr = CharList;
-    while (! MR_list_is_empty(char_list_ptr)) {
-        c = (MR_Char) MR_list_head(char_list_ptr);
-        if (MR_is_ascii(c)) {
-            size++;
-        } else {
-            // XXX ILSEQ Do something if c is a surrogate code point.
-            size += MR_utf8_width(c);
-        }
-        char_list_ptr = MR_list_tail(char_list_ptr);
-    }
-
-    // Allocate heap space for string.
-    MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
-
-    // Loop to copy the characters from the char_list to the string.
     SUCCESS_INDICATOR = MR_TRUE;
     size = 0;
     char_list_ptr = CharList;
     while (! MR_list_is_empty(char_list_ptr)) {
-        c = (MR_Char) MR_list_head(char_list_ptr);
-        // It is an error to put a null character in a string
-        // (see the comments at the top of this file).
+        MR_Char c = (MR_Char) MR_list_head(char_list_ptr);
         if (c == '\\0') {
             SUCCESS_INDICATOR = MR_FALSE;
             break;
         }
         if (MR_is_ascii(c)) {
-            Str[size] = c;
             size++;
         } else {
-            size += MR_utf8_encode(Str + size, c);
+            size_t csize = MR_utf8_width(c);
+            if (csize == 0) {
+                // c is a surrogate code point (or even out of range,
+                // but that is not supposed to happen).
+                SUCCESS_INDICATOR = MR_FALSE;
+                break;
+            }
+            size += csize;
         }
         char_list_ptr = MR_list_tail(char_list_ptr);
     }
 
-    Str[size] = '\\0';
+    if (SUCCESS_INDICATOR) {
+        // Allocate heap space for string.
+        MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
+
+        // Loop to copy the characters from the char_list to the string.
+        size = 0;
+        char_list_ptr = CharList;
+        while (! MR_list_is_empty(char_list_ptr)) {
+            MR_Char c = (MR_Char) MR_list_head(char_list_ptr);
+            if (MR_is_ascii(c)) {
+                Str[size] = c;
+                size++;
+            } else {
+                size += MR_utf8_encode(Str + size, c);
+            }
+            char_list_ptr = MR_list_tail(char_list_ptr);
+        }
+
+        Str[size] = '\\0';
+    }
 }").
 :- pragma foreign_proc("C#",
     semidet_from_char_list(CharList::in, Str::uo),
@@ -1808,6 +1804,7 @@ semidet_from_char_list(CharList, Str) :-
     ;
         CharList = [C | Cs],
         not char.to_int(C, 0),
+        internal_encoding_is_utf8 => not char.is_surrogate(C),
         semidet_from_char_list(Cs, Str0),
         first_char(Str, C, Str0)
     ).
@@ -1827,7 +1824,7 @@ from_rev_char_list(Chars, Str) :-
     ( if semidet_from_rev_char_list(Chars, Str0) then
         Str = Str0
     else
-        unexpected($pred, "null character in list")
+        unexpected($pred, "null character or surrogate code point in list")
     ).
 
 :- pragma foreign_proc("C",
@@ -1837,48 +1834,55 @@ from_rev_char_list(Chars, Str) :-
 "{
     MR_Word list_ptr;
     MR_Word size;
-    MR_Char c;
 
     // Loop to calculate list length in `size' using list in `list_ptr'.
+    SUCCESS_INDICATOR = MR_TRUE;
     size = 0;
     list_ptr = Chars;
     while (!MR_list_is_empty(list_ptr)) {
-        c = (MR_Char) MR_list_head(list_ptr);
-        if (MR_is_ascii(c)) {
-            size++;
-        } else {
-            // XXX ILSEQ Do something if c is a surrogate code point.
-            size += MR_utf8_width(c);
-        }
-        list_ptr = MR_list_tail(list_ptr);
-    }
-
-    // Allocate heap space for string.
-    MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
-
-    // Set size to be the offset of the end of the string
-    // (ie the \\0) and null terminate the string.
-    Str[size] = '\\0';
-
-    // Loop to copy the characters from the list_ptr to the string
-    // in reverse order.
-    list_ptr = Chars;
-    SUCCESS_INDICATOR = MR_TRUE;
-    while (!MR_list_is_empty(list_ptr)) {
-        c = (MR_Char) MR_list_head(list_ptr);
+        MR_Char c = (MR_Char) MR_list_head(list_ptr);
         if (c == '\\0') {
             SUCCESS_INDICATOR = MR_FALSE;
             break;
         }
         if (MR_is_ascii(c)) {
-            size--;
-            Str[size] = c;
+            size++;
         } else {
-            size -= MR_utf8_width(c);
-            MR_utf8_encode(Str + size, c);
+            size_t csize = MR_utf8_width(c);
+            if (csize == 0) {
+                // c is a surrogate code point (or even out of range,
+                // but that is not supposed to happen).
+                SUCCESS_INDICATOR = MR_FALSE;
+                break;
+            }
+            size += csize;
         }
         list_ptr = MR_list_tail(list_ptr);
     }
+
+    if (SUCCESS_INDICATOR) {
+        // Allocate heap space for string.
+        MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID);
+
+        // Set size to be the offset of the end of the string
+        // (ie the \\0) and null terminate the string.
+        Str[size] = '\\0';
+
+        // Loop to copy the characters from the list_ptr to the string
+        // in reverse order.
+        list_ptr = Chars;
+        while (! MR_list_is_empty(list_ptr)) {
+            MR_Char c = (MR_Char) MR_list_head(list_ptr);
+            if (MR_is_ascii(c)) {
+                size--;
+                Str[size] = c;
+            } else {
+                size -= MR_utf8_width(c);
+                MR_utf8_encode(Str + size, c);
+            }
+            list_ptr = MR_list_tail(list_ptr);
+        }
+    }
 }").
 :- pragma foreign_proc("C#",
     semidet_from_rev_char_list(Chars::in, Str::uo),
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index d3cb00262..a4dcf7c3c 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -485,6 +485,7 @@ extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
 extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);
 
 // Return the number of bytes required to encode the code point `c' in UTF-8.
+// Returns 0 for surrogate code points or illegal values.
 
 extern size_t           MR_utf8_width(MR_Char c);
 
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index ee9f51179..47e8ec5a8 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -365,6 +365,7 @@ ORDINARY_PROGS = \
 	string_count_codepoints_ilseq \
 	string_first_char \
 	string_fold_ilseq \
+	string_from_char_list_ilseq \
 	string_index_ilseq \
 	string_index_next_ilseq \
 	string_loop \
diff --git a/tests/hard_coded/null_char.exp b/tests/hard_coded/null_char.exp
index 7e03f6907..842f8aa9e 100644
--- a/tests/hard_coded/null_char.exp
+++ b/tests/hard_coded/null_char.exp
@@ -1,6 +1,6 @@
 All tests should result in exceptions being thrown.
-exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character in list")))
-exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character in list")))
+exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list")))
+exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character or surrogate code point in list")))
 exception(univ_cons(software_error("predicate `string.set_char\'/4: Unexpected: null character")))
 exception(univ_cons(software_error("predicate `string.unsafe_set_char\'/4: Unexpected: null character")))
 error("", io_error("null character in input"))
diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp b/tests/hard_coded/string_from_char_list_ilseq.exp
new file mode 100644
index 000000000..73f24d867
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.exp
@@ -0,0 +1,15 @@
+semidet_from_char_list:
+a b 
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+semidet_from_char_list failed
+
+semidet_from_rev_char_list:
+b a 
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
+semidet_from_rev_char_list failed
diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp2 b/tests/hard_coded/string_from_char_list_ilseq.exp2
new file mode 100644
index 000000000..c77d3d069
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.exp2
@@ -0,0 +1,15 @@
+semidet_from_char_list:
+a b 
+semidet_from_char_list failed
+a 0xd83d b 
+a 0xde00 b 
+a 😀 b 
+a 0xde00 0xd83d b 
+
+semidet_from_rev_char_list:
+b a 
+semidet_from_rev_char_list failed
+b 0xd83d a 
+b 0xde00 a 
+b 0xde00 0xd83d a 
+b 😀 a 
diff --git a/tests/hard_coded/string_from_char_list_ilseq.m b/tests/hard_coded/string_from_char_list_ilseq.m
new file mode 100644
index 000000000..740f256b3
--- /dev/null
+++ b/tests/hard_coded/string_from_char_list_ilseq.m
@@ -0,0 +1,88 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_from_char_list_ilseq.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    io.write_string("semidet_from_char_list:\n", !IO),
+    list.foldl(test_from_char_list, test_cases, !IO),
+
+    io.write_string("\nsemidet_from_rev_char_list:\n", !IO),
+    list.foldl(test_from_rev_char_list, test_cases, !IO).
+
+:- func test_cases = list(list(char)).
+
+test_cases = [
+    ['a', 'b'],
+    ['a', char.det_from_int(0), 'b'],
+    ['a', char.det_from_int(0xd83d), 'b'],
+    ['a', char.det_from_int(0xde00), 'b'],
+    ['a', char.det_from_int(0xd83d), char.det_from_int(0xde00), 'b'],
+    ['a', char.det_from_int(0xde00), char.det_from_int(0xd83d), 'b']
+].
+
+:- pred test_from_char_list(list(char)::in, io::di, io::uo) is det.
+
+test_from_char_list(Chars, !IO) :-
+    ( if semidet_from_char_list(Chars, Str) then
+        write_string_debug(Str, !IO),
+        io.nl(!IO)
+    else
+        io.write_string("semidet_from_char_list failed\n", !IO)
+    ).
+
+:- pred test_from_rev_char_list(list(char)::in, io::di, io::uo) is det.
+
+test_from_rev_char_list(Chars, !IO) :-
+    ( if semidet_from_rev_char_list(Chars, Str) then
+        write_string_debug(Str, !IO),
+        io.nl(!IO)
+    else
+        io.write_string("semidet_from_rev_char_list failed\n", !IO)
+    ).
+
+:- pred write_string_debug(string::in, io::di, io::uo) is det.
+
+write_string_debug(S, !IO) :-
+    write_string_debug_loop(S, 0, !IO).
+
+:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.
+
+write_string_debug_loop(S, Index, !IO) :-
+    ( if string.index_next(S, Index, NextIndex, Char) then
+        ( if is_surrogate(Char) then
+            write_hex(char.to_int(Char), !IO)
+        else
+            io.write_char(Char, !IO)
+        ),
+        io.write_char(' ', !IO),
+        write_string_debug_loop(S, NextIndex, !IO)
+    else
+        true
+    ).
+
+:- pred write_hex(int::in, io::di, io::uo) is det.
+
+write_hex(I, !IO) :-
+    io.format("%#x", [i(I)], !IO).
diff --git a/tests/hard_coded/string_hash.m b/tests/hard_coded/string_hash.m
index 1e8afe68b..5d534935b 100644
--- a/tests/hard_coded/string_hash.m
+++ b/tests/hard_coded/string_hash.m
@@ -93,7 +93,12 @@ rand_char(Char, !RS) :-
     random.random(Rand, !RS),
     % U+0001..U+10ffff (avoid null character).
     Int = 1 + (Rand `mod` char.max_char_value),
-    char.det_from_int(Int, Char).
+    char.det_from_int(Int, Char0),
+    ( if is_surrogate(Char0) then
+        rand_char(Char, !RS)
+    else
+        Char = Char0
+    ).
 
 :- pred test_hash(string::in, int::in, int::in, string::in,
     bool::in, bool::out, io::di, io::uo) is det.
-- 
2.23.0



More information about the reviews mailing list