<div dir="ltr">This looks fine.</div> <div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Tue, Oct 29, 2019 at 12:02 PM Peter Wang <<a href="mailto:novalazy@gmail.com">novalazy@gmail.com</a>> wrote: </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">library/string.m: Make from_char_list, from_rev_char_list, to_char_list throw an exception if the list of chars includes a surrogate code point that cannot be encoded in a UTF-8 string. Make semidet_from_char_list, semidet_from_rev_char_list, to_char_list fail if the list of chars includes a surrogate code point that cannot be encoded in a UTF-8 string. runtime/mercury_string.h: Document return value of MR_utf8_width. tests/hard_coded/Mmakefile: tests/hard_coded/string_from_char_list_ilseq.exp: tests/hard_coded/string_from_char_list_ilseq.exp2: tests/hard_coded/string_from_char_list_ilseq.m: Add test case. tests/hard_coded/null_char.exp: Expect new message in exceptions thrown by from_char_list, from_rev_char_list. tests/hard_coded/string_hash.m: Don't generate surrogate code points in random strings. --- library/string.m | 162 +++++++++--------- runtime/mercury_string.h | 1 + tests/hard_coded/Mmakefile | 1 + tests/hard_coded/null_char.exp | 4 +- .../string_from_char_list_ilseq.exp | 15 ++ .../string_from_char_list_ilseq.exp2 | 15 ++ .../hard_coded/string_from_char_list_ilseq.m | 88 ++++++++++ tests/hard_coded/string_hash.m | 7 +- 8 files changed, 211 insertions(+), 82 deletions(-) create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp create mode 100644 tests/hard_coded/string_from_char_list_ilseq.exp2 create mode 100644 tests/hard_coded/string_from_char_list_ilseq.m diff --git a/library/string.m b/library/string.m index ab19974d2..cbdbcd128 100644 --- a/library/string.m +++ b/library/string.m @@ -131,10 +131,10 @@ % If strings use UTF-16 encoding then each unpaired surrogate code point % is returned as a separate code point in the list. % - % The reverse mode of the predicate throws an exception if - % the list of characters contains a null character. - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % The reverse mode of the predicate throws an exception if the list + % contains a null character or code point that cannot be encoded in a + % string (namely, surrogate code points cannot be encoded in UTF-8 + % strings). % % The reverse mode of to_char_list/2 is deprecated because the implied % ability to round trip convert a string to a list then back to the same @@ -155,10 +155,10 @@ % If strings use UTF-16 encoding then each unpaired surrogate code point % is returned as a separate code point in the list. % - % The reverse mode of the predicate throws an exception if - % the list of characters contains a null character. - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % The reverse mode of the predicate throws an exception if the list + % contains a null character or code point that cannot be encoded in a + % string (namely, surrogate code points cannot be encoded in UTF-8 + % strings). % % The reverse mode of to_rev_char_list/2 is deprecated because the implied % ability to round trip convert a string to a list then back to the same @@ -171,10 +171,9 @@ :- mode to_rev_char_list(uo, in) is det. % Convert a list of characters (code points) to a string. - % Throws an exception if the list of characters contains a null character. - % - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % Throws an exception if the list contains a null character or code point + % that cannot be encoded in a string (namely, surrogate code points cannot + % be encoded in UTF-8 strings). % % The forward mode of from_char_list/2 is deprecated because the implied % ability to round trip convert a string to a list then back to the same @@ -187,28 +186,21 @@ :- mode from_char_list(out, in) is det. % As above, but fail instead of throwing an exception if the list contains - % a null character. - % - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % a null character or code point that cannot be encoded in a string. % :- pred semidet_from_char_list(list(char)::in, string::uo) is semidet. % Same as from_char_list, except that it reverses the order % of the characters. - % Throws an exception if the list of characters contains a null character. - % - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % Throws an exception if the list contains a null character or code point + % that cannot be encoded in a string (namely, surrogate code points cannot + % be encoded in UTF-8 strings). % :- func from_rev_char_list(list(char)::in) = (string::uo) is det. :- pred from_rev_char_list(list(char)::in, string::uo) is det. % As above, but fail instead of throwing an exception if the list contains - % a null character. - % - % NOTE: In the future we may also throw an exception if the list contains - % a surrogate code point. + % a null character or code point that cannot be encoded in a string. % :- pred semidet_from_rev_char_list(list(char)::in, string::uo) is semidet. @@ -1696,7 +1688,7 @@ from_char_list(Chars::in, Str::uo) :- ( if semidet_from_char_list(Chars, Str0) then Str = Str0 else - unexpected($pred, "null character in list") + unexpected($pred, "null character or surrogate code point in list") ). :- pragma foreign_proc("C", @@ -1704,51 +1696,55 @@ from_char_list(Chars::in, Str::uo) :- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail, does_not_affect_liveness, may_not_duplicate, no_sharing], "{ - // mode (uo, in) is det MR_Word char_list_ptr; size_t size; - MR_Char c; // Loop to calculate list length + sizeof(MR_Word) in `size' // using list in `char_list_ptr'. - size = 0; - char_list_ptr = CharList; - while (! MR_list_is_empty(char_list_ptr)) { - c = (MR_Char) MR_list_head(char_list_ptr); - if (MR_is_ascii(c)) { - size++; - } else { - // XXX ILSEQ Do something if c is a surrogate code point. - size += MR_utf8_width(c); - } - char_list_ptr = MR_list_tail(char_list_ptr); - } - - // Allocate heap space for string. - MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID); - - // Loop to copy the characters from the char_list to the string. SUCCESS_INDICATOR = MR_TRUE; size = 0; char_list_ptr = CharList; while (! MR_list_is_empty(char_list_ptr)) { - c = (MR_Char) MR_list_head(char_list_ptr); - // It is an error to put a null character in a string - // (see the comments at the top of this file). + MR_Char c = (MR_Char) MR_list_head(char_list_ptr); if (c == '\\0') { SUCCESS_INDICATOR = MR_FALSE; break; } if (MR_is_ascii(c)) { - Str[size] = c; size++; } else { - size += MR_utf8_encode(Str + size, c); + size_t csize = MR_utf8_width(c); + if (csize == 0) { + // c is a surrogate code point (or even out of range, + // but that is not supposed to happen). + SUCCESS_INDICATOR = MR_FALSE; + break; + } + size += csize; } char_list_ptr = MR_list_tail(char_list_ptr); } - Str[size] = '\\0'; + if (SUCCESS_INDICATOR) { + // Allocate heap space for string. + MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID); + + // Loop to copy the characters from the char_list to the string. + size = 0; + char_list_ptr = CharList; + while (! MR_list_is_empty(char_list_ptr)) { + MR_Char c = (MR_Char) MR_list_head(char_list_ptr); + if (MR_is_ascii(c)) { + Str[size] = c; + size++; + } else { + size += MR_utf8_encode(Str + size, c); + } + char_list_ptr = MR_list_tail(char_list_ptr); + } + + Str[size] = '\\0'; + } }"). :- pragma foreign_proc("C#", semidet_from_char_list(CharList::in, Str::uo), @@ -1808,6 +1804,7 @@ semidet_from_char_list(CharList, Str) :- ; CharList = [C | Cs], not char.to_int(C, 0), + internal_encoding_is_utf8 => not char.is_surrogate(C), semidet_from_char_list(Cs, Str0), first_char(Str, C, Str0) ). @@ -1827,7 +1824,7 @@ from_rev_char_list(Chars, Str) :- ( if semidet_from_rev_char_list(Chars, Str0) then Str = Str0 else - unexpected($pred, "null character in list") + unexpected($pred, "null character or surrogate code point in list") ). :- pragma foreign_proc("C", @@ -1837,48 +1834,55 @@ from_rev_char_list(Chars, Str) :- "{ MR_Word list_ptr; MR_Word size; - MR_Char c; // Loop to calculate list length in `size' using list in `list_ptr'. + SUCCESS_INDICATOR = MR_TRUE; size = 0; list_ptr = Chars; while (!MR_list_is_empty(list_ptr)) { - c = (MR_Char) MR_list_head(list_ptr); - if (MR_is_ascii(c)) { - size++; - } else { - // XXX ILSEQ Do something if c is a surrogate code point. - size += MR_utf8_width(c); - } - list_ptr = MR_list_tail(list_ptr); - } - - // Allocate heap space for string. - MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID); - - // Set size to be the offset of the end of the string - // (ie the \\0) and null terminate the string. - Str[size] = '\\0'; - - // Loop to copy the characters from the list_ptr to the string - // in reverse order. - list_ptr = Chars; - SUCCESS_INDICATOR = MR_TRUE; - while (!MR_list_is_empty(list_ptr)) { - c = (MR_Char) MR_list_head(list_ptr); + MR_Char c = (MR_Char) MR_list_head(list_ptr); if (c == '\\0') { SUCCESS_INDICATOR = MR_FALSE; break; } if (MR_is_ascii(c)) { - size--; - Str[size] = c; + size++; } else { - size -= MR_utf8_width(c); - MR_utf8_encode(Str + size, c); + size_t csize = MR_utf8_width(c); + if (csize == 0) { + // c is a surrogate code point (or even out of range, + // but that is not supposed to happen). + SUCCESS_INDICATOR = MR_FALSE; + break; + } + size += csize; } list_ptr = MR_list_tail(list_ptr); } + + if (SUCCESS_INDICATOR) { + // Allocate heap space for string. + MR_allocate_aligned_string_msg(Str, size, MR_ALLOC_ID); + + // Set size to be the offset of the end of the string + // (ie the \\0) and null terminate the string. + Str[size] = '\\0'; + + // Loop to copy the characters from the list_ptr to the string + // in reverse order. + list_ptr = Chars; + while (! MR_list_is_empty(list_ptr)) { + MR_Char c = (MR_Char) MR_list_head(list_ptr); + if (MR_is_ascii(c)) { + size--; + Str[size] = c; + } else { + size -= MR_utf8_width(c); + MR_utf8_encode(Str + size, c); + } + list_ptr = MR_list_tail(list_ptr); + } + } }"). :- pragma foreign_proc("C#", semidet_from_rev_char_list(Chars::in, Str::uo), diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h index d3cb00262..a4dcf7c3c 100644 --- a/runtime/mercury_string.h +++ b/runtime/mercury_string.h @@ -485,6 +485,7 @@ extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s, extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos); // Return the number of bytes required to encode the code point `c' in UTF-8. +// Returns 0 for surrogate code points or illegal values. extern size_t MR_utf8_width(MR_Char c); diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile index ee9f51179..47e8ec5a8 100644 --- a/tests/hard_coded/Mmakefile +++ b/tests/hard_coded/Mmakefile @@ -365,6 +365,7 @@ ORDINARY_PROGS = \ string_count_codepoints_ilseq \ string_first_char \ string_fold_ilseq \ + string_from_char_list_ilseq \ string_index_ilseq \ string_index_next_ilseq \ string_loop \ diff --git a/tests/hard_coded/null_char.exp b/tests/hard_coded/null_char.exp index 7e03f6907..842f8aa9e 100644 --- a/tests/hard_coded/null_char.exp +++ b/tests/hard_coded/null_char.exp @@ -1,6 +1,6 @@ All tests should result in exceptions being thrown. -exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character in list"))) -exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character in list"))) +exception(univ_cons(software_error("predicate `string.from_char_list\'/2: Unexpected: null character or surrogate code point in list"))) +exception(univ_cons(software_error("predicate `string.from_rev_char_list\'/2: Unexpected: null character or surrogate code point in list"))) exception(univ_cons(software_error("predicate `string.set_char\'/4: Unexpected: null character"))) exception(univ_cons(software_error("predicate `string.unsafe_set_char\'/4: Unexpected: null character"))) error("", io_error("null character in input")) diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp b/tests/hard_coded/string_from_char_list_ilseq.exp new file mode 100644 index 000000000..73f24d867 --- /dev/null +++ b/tests/hard_coded/string_from_char_list_ilseq.exp @@ -0,0 +1,15 @@ +semidet_from_char_list: +a b +semidet_from_char_list failed +semidet_from_char_list failed +semidet_from_char_list failed +semidet_from_char_list failed +semidet_from_char_list failed + +semidet_from_rev_char_list: +b a +semidet_from_rev_char_list failed +semidet_from_rev_char_list failed +semidet_from_rev_char_list failed +semidet_from_rev_char_list failed +semidet_from_rev_char_list failed diff --git a/tests/hard_coded/string_from_char_list_ilseq.exp2 b/tests/hard_coded/string_from_char_list_ilseq.exp2 new file mode 100644 index 000000000..c77d3d069 --- /dev/null +++ b/tests/hard_coded/string_from_char_list_ilseq.exp2 @@ -0,0 +1,15 @@ +semidet_from_char_list: +a b +semidet_from_char_list failed +a 0xd83d b +a 0xde00 b +a 😀 b +a 0xde00 0xd83d b + +semidet_from_rev_char_list: +b a +semidet_from_rev_char_list failed +b 0xd83d a +b 0xde00 a +b 0xde00 0xd83d a +b 😀 a diff --git a/tests/hard_coded/string_from_char_list_ilseq.m b/tests/hard_coded/string_from_char_list_ilseq.m new file mode 100644 index 000000000..740f256b3 --- /dev/null +++ b/tests/hard_coded/string_from_char_list_ilseq.m @@ -0,0 +1,88 @@ +%---------------------------------------------------------------------------% +% vim: ts=4 sw=4 et ft=mercury +%---------------------------------------------------------------------------% +% +% The .exp file is for backends using UTF-8 string encoding. +% The .exp2 file is for backends using UTF-16 string encoding. +% +%---------------------------------------------------------------------------% + +:- module string_from_char_list_ilseq. +:- interface. + +:- import_module io. + +:- pred main(io::di, io::uo) is det. + +%---------------------------------------------------------------------------% + +:- implementation. + +:- import_module char. +:- import_module list. +:- import_module string. + +%---------------------------------------------------------------------------% + +main(!IO) :- + io.write_string("semidet_from_char_list:\n", !IO), + list.foldl(test_from_char_list, test_cases, !IO), + + io.write_string("\nsemidet_from_rev_char_list:\n", !IO), + list.foldl(test_from_rev_char_list, test_cases, !IO). + +:- func test_cases = list(list(char)). + +test_cases = [ + ['a', 'b'], + ['a', char.det_from_int(0), 'b'], + ['a', char.det_from_int(0xd83d), 'b'], + ['a', char.det_from_int(0xde00), 'b'], + ['a', char.det_from_int(0xd83d), char.det_from_int(0xde00), 'b'], + ['a', char.det_from_int(0xde00), char.det_from_int(0xd83d), 'b'] +]. + +:- pred test_from_char_list(list(char)::in, io::di, io::uo) is det. + +test_from_char_list(Chars, !IO) :- + ( if semidet_from_char_list(Chars, Str) then + write_string_debug(Str, !IO), + <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO) + else + io.write_string("semidet_from_char_list failed\n", !IO) + ). + +:- pred test_from_rev_char_list(list(char)::in, io::di, io::uo) is det. + +test_from_rev_char_list(Chars, !IO) :- + ( if semidet_from_rev_char_list(Chars, Str) then + write_string_debug(Str, !IO), + <a href="http://io.nl" rel="noreferrer" target="_blank">io.nl</a>(!IO) + else + io.write_string("semidet_from_rev_char_list failed\n", !IO) + ). + +:- pred write_string_debug(string::in, io::di, io::uo) is det. + +write_string_debug(S, !IO) :- + write_string_debug_loop(S, 0, !IO). + +:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det. + +write_string_debug_loop(S, Index, !IO) :- + ( if string.index_next(S, Index, NextIndex, Char) then + ( if is_surrogate(Char) then + write_hex(char.to_int(Char), !IO) + else + io.write_char(Char, !IO) + ), + io.write_char(' ', !IO), + write_string_debug_loop(S, NextIndex, !IO) + else + true + ). + +:- pred write_hex(int::in, io::di, io::uo) is det. + +write_hex(I, !IO) :- + io.format("%#x", [i(I)], !IO). diff --git a/tests/hard_coded/string_hash.m b/tests/hard_coded/string_hash.m index 1e8afe68b..5d534935b 100644 --- a/tests/hard_coded/string_hash.m +++ b/tests/hard_coded/string_hash.m @@ -93,7 +93,12 @@ rand_char(Char, !RS) :- random.random(Rand, !RS), % U+0001..U+10ffff (avoid null character). Int = 1 + (Rand `mod` char.max_char_value), - char.det_from_int(Int, Char). + char.det_from_int(Int, Char0), + ( if is_surrogate(Char0) then + rand_char(Char, !RS) + else + Char = Char0 + ). :- pred test_hash(string::in, int::in, int::in, string::in, bool::in, bool::out, io::di, io::uo) is det. -- 2.23.0 _______________________________________________ reviews mailing list <a href="mailto:reviews@lists.mercurylang.org" target="_blank">reviews@lists.mercurylang.org</a> <a href="https://lists.mercurylang.org/listinfo/reviews" rel="noreferrer" target="_blank">https://lists.mercurylang.org/listinfo/reviews</a> </blockquote></div>