[m-rev.] for review: Add string.from_code_unit_list_allow_ill_formed.
Peter Wang
novalazy at gmail.com
Mon Nov 4 16:51:39 AEDT 2019
Suggestions for a better name?
----
library/string.m:
Add string.from_code_unit_list_allow_ill_formed/2.
tests/hard_coded/string_from_code_unit_list.exp:
tests/hard_coded/string_from_code_unit_list.exp2:
tests/hard_coded/string_from_code_unit_list.m:
Extend test case.
NEWS:
Announce addition.
---
NEWS | 1 +
library/string.m | 135 ++++++++++++------
.../hard_coded/string_from_code_unit_list.exp | 7 +
.../string_from_code_unit_list.exp2 | 7 +
tests/hard_coded/string_from_code_unit_list.m | 26 +++-
5 files changed, 126 insertions(+), 50 deletions(-)
diff --git a/NEWS b/NEWS
index b8827d9b1..e876cd6d3 100644
--- a/NEWS
+++ b/NEWS
@@ -420,6 +420,7 @@ Changes to the Mercury standard library:
- is_all_alnum/1
- is_empty/1
- is_well_formed/1
+ - from_code_unit_list_allow_ill_formed/2
- to_utf8_code_unit_list/2
- to_utf16_code_unit_list/2
- from_utf8_code_unit_list/2
diff --git a/library/string.m b/library/string.m
index 10428a0b4..9adabf662 100644
--- a/library/string.m
+++ b/library/string.m
@@ -224,6 +224,14 @@
%
:- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
+ % Convert a list of code units to a string. Unlike from_code_unit_list/2,
+ % no well-formedness check is done: the result may be ill-formed.
+ % Fails if the list contains a code unit that is out of range
+ % or if the string would contain a null character.
+ %
+:- pred from_code_unit_list_allow_ill_formed(list(int)::in, string::uo)
+ is semidet.
+
% Convert a list of UTF-8 code units to a string.
% Fails if the list does not contain a valid encoding of a string
% or if the string would contain a null character.
@@ -2009,12 +2017,18 @@ encode_utf16(Char, CodeList0, CodeList) :-
%---------------------%
-% XXX ILSEQ to_code_unit_list(S0, L), from_code_unit_list(L, S) may fail
-% because the original string contained an ill-formed sequence.
-% Perhaps we should provide a predicate that can produce the original string.
+from_code_unit_list(CodeList, Str) :-
+ Verify = yes,
+ do_from_code_unit_list(CodeList, Verify, Str).
+
+from_code_unit_list_allow_ill_formed(CodeList, Str) :-
+ Verify = no,
+ do_from_code_unit_list(CodeList, Verify, Str).
+
+:- pred do_from_code_unit_list(list(int)::in, bool::in, string::uo) is semidet.
:- pragma foreign_proc("C",
- from_code_unit_list(CodeList::in, Str::uo),
+ do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, may_not_duplicate, no_sharing],
"
@@ -2047,43 +2061,57 @@ encode_utf16(Char, CodeList0, CodeList) :-
Str[size] = '\\0';
- SUCCESS_INDICATOR = SUCCESS_INDICATOR && MR_utf8_verify(Str);
+ if (SUCCESS_INDICATOR && Verify == MR_YES) {
+ SUCCESS_INDICATOR = MR_utf8_verify(Str);
+ }
").
:- pragma foreign_proc("Java",
- from_code_unit_list(CodeList::in, Str::uo),
+ do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
java.lang.StringBuilder sb = new java.lang.StringBuilder();
- boolean prev_high = false;
SUCCESS_INDICATOR = true;
- Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
- for (int i : iterable) {
- // Check for null character or invalid code unit.
- if (i <= 0 || i > 0xffff) {
- SUCCESS_INDICATOR = false;
- break;
+ if (Verify == bool.YES) {
+ boolean prev_high = false;
+ Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+ for (int i : iterable) {
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ char c = (char) i;
+ if (prev_high) {
+ if (!java.lang.Character.isLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ prev_high = false;
+ } else if (java.lang.Character.isHighSurrogate(c)) {
+ prev_high = true;
+ } else if (java.lang.Character.isLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ sb.append(c);
}
- char c = (char) i;
- if (prev_high) {
- if (!java.lang.Character.isLowSurrogate(c)) {
+ SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+ } else {
+ Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+ for (int i : iterable) {
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
SUCCESS_INDICATOR = false;
break;
}
- prev_high = false;
- } else if (java.lang.Character.isHighSurrogate(c)) {
- prev_high = true;
- } else if (java.lang.Character.isLowSurrogate(c)) {
- SUCCESS_INDICATOR = false;
- break;
+ char c = (char) i;
+ sb.append(c);
}
- sb.append(c);
}
- SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
-
if (SUCCESS_INDICATOR) {
Str = sb.toString();
} else {
@@ -2091,41 +2119,54 @@ encode_utf16(Char, CodeList0, CodeList) :-
}
").
:- pragma foreign_proc("C#",
- from_code_unit_list(CodeList::in, Str::uo),
+ do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
System.Text.StringBuilder sb = new System.Text.StringBuilder();
- bool prev_high = false;
SUCCESS_INDICATOR = true;
- while (!list.is_empty(CodeList)) {
- int i = (int) list.det_head(CodeList);
- // Check for null character or invalid code unit.
- if (i <= 0 || i > 0xffff) {
- SUCCESS_INDICATOR = false;
- break;
+ if (Verify == mr_bool.YES) {
+ bool prev_high = false;
+ while (!list.is_empty(CodeList)) {
+ int i = (int) list.det_head(CodeList);
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ char c = (char) i;
+ if (prev_high) {
+ if (!System.Char.IsLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ prev_high = false;
+ } else if (System.Char.IsHighSurrogate(c)) {
+ prev_high = true;
+ } else if (System.Char.IsLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ sb.Append(c);
+ CodeList = list.det_tail(CodeList);
}
- char c = (char) i;
- if (prev_high) {
- if (!System.Char.IsLowSurrogate(c)) {
+ SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+ } else {
+ while (!list.is_empty(CodeList)) {
+ int i = (int) list.det_head(CodeList);
+ // Check for null character or invalid code unit.
+ if (i <= 0 || i > 0xffff) {
SUCCESS_INDICATOR = false;
break;
}
- prev_high = false;
- } else if (System.Char.IsHighSurrogate(c)) {
- prev_high = true;
- } else if (System.Char.IsLowSurrogate(c)) {
- SUCCESS_INDICATOR = false;
- break;
+ char c = (char) i;
+ sb.Append(c);
+ CodeList = list.det_tail(CodeList);
}
- sb.Append(c);
- CodeList = list.det_tail(CodeList);
}
- SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
-
if (SUCCESS_INDICATOR) {
Str = sb.ToString();
} else {
@@ -2133,7 +2174,7 @@ encode_utf16(Char, CodeList0, CodeList) :-
}
").
:- pragma foreign_proc("Erlang",
- from_code_unit_list(CodeList::in, Str::uo),
+ do_from_code_unit_list(CodeList::in, _Verify::in, Str::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
diff --git a/tests/hard_coded/string_from_code_unit_list.exp b/tests/hard_coded/string_from_code_unit_list.exp
index 32e5b9f99..a49f01657 100644
--- a/tests/hard_coded/string_from_code_unit_list.exp
+++ b/tests/hard_coded/string_from_code_unit_list.exp
@@ -2,3 +2,10 @@ from_code_unit_list([]) = ""
from_code_unit_list([65, 0, 66]) failed
from_code_unit_list([240, 159, 152, 128]) = "😀"
from_code_unit_list([-16, 159, 152, 128]) failed
+from_code_unit_list([128, 152, 159, 240]) failed
+
+from_code_unit_list_allow_ill_formed([]) = []
+from_code_unit_list_allow_ill_formed([65, 0, 66]) failed
+from_code_unit_list_allow_ill_formed([240, 159, 152, 128]) = [240, 159, 152, 128]
+from_code_unit_list_allow_ill_formed([-16, 159, 152, 128]) failed
+from_code_unit_list_allow_ill_formed([128, 152, 159, 240]) = [128, 152, 159, 240]
diff --git a/tests/hard_coded/string_from_code_unit_list.exp2 b/tests/hard_coded/string_from_code_unit_list.exp2
index a168ecc72..0eebd2f4e 100644
--- a/tests/hard_coded/string_from_code_unit_list.exp2
+++ b/tests/hard_coded/string_from_code_unit_list.exp2
@@ -2,3 +2,10 @@ from_code_unit_list([]) = ""
from_code_unit_list([65, 0, 66]) failed
from_code_unit_list([55357, 56832]) = "😀"
from_code_unit_list([-10179, 56832]) failed
+from_code_unit_list([56832, 55357]) failed
+
+from_code_unit_list_allow_ill_formed([]) = []
+from_code_unit_list_allow_ill_formed([65, 0, 66]) failed
+from_code_unit_list_allow_ill_formed([55357, 56832]) = [55357, 56832]
+from_code_unit_list_allow_ill_formed([-10179, 56832]) failed
+from_code_unit_list_allow_ill_formed([56832, 55357]) = [56832, 55357]
diff --git a/tests/hard_coded/string_from_code_unit_list.m b/tests/hard_coded/string_from_code_unit_list.m
index 5f2fd2ca2..ad94bc4c4 100644
--- a/tests/hard_coded/string_from_code_unit_list.m
+++ b/tests/hard_coded/string_from_code_unit_list.m
@@ -30,7 +30,9 @@ main(!IO) :-
else
Cases = test_cases_16
),
- list.foldl(test_from_code_unit_list, Cases, !IO).
+ list.foldl(test_from_code_unit_list, Cases, !IO),
+ io.nl(!IO),
+ list.foldl(test_from_code_unit_list_allow_ill_formed, Cases, !IO).
:- func test_cases_8 = list(list(int)).
@@ -40,7 +42,8 @@ test_cases_8 = [
[0xf0, 0x9f, 0x98, 0x80],
% Sign extend first byte: the old implementation used to silently ignore
% higher bits so this case would succeed.
- [\0xff \/ 0xf0, 0x9f, 0x98, 0x80]
+ [\0xff \/ 0xf0, 0x9f, 0x98, 0x80],
+ [0x80, 0x98, 0x9f, 0xf0]
].
:- func test_cases_16 = list(list(int)).
@@ -49,7 +52,8 @@ test_cases_16 = [
[],
[65, 0, 66],
[0xd83d, 0xde00],
- [\0xffff \/ 0xd83d, 0xde00]
+ [\0xffff \/ 0xd83d, 0xde00],
+ [0xde00, 0xd83d]
].
:- pred test_from_code_unit_list(list(int)::in, io::di, io::uo) is det.
@@ -65,3 +69,19 @@ test_from_code_unit_list(CodeList, !IO) :-
else
io.write_string(" failed\n", !IO)
).
+
+:- pred test_from_code_unit_list_allow_ill_formed(list(int)::in,
+ io::di, io::uo) is det.
+
+test_from_code_unit_list_allow_ill_formed(CodeList, !IO) :-
+ io.write_string("from_code_unit_list_allow_ill_formed(", !IO),
+ io.write(CodeList, !IO),
+ io.write_string(")", !IO),
+ ( if string.from_code_unit_list_allow_ill_formed(CodeList, String) then
+ string.to_code_unit_list(String, CodeListB),
+ io.write_string(" = ", !IO),
+ io.write(CodeListB, !IO),
+ io.write_string("\n", !IO)
+ else
+ io.write_string(" failed\n", !IO)
+ ).
--
2.23.0
More information about the reviews
mailing list