[m-rev.] for review: Add string.from_code_unit_list_allow_ill_formed.

Peter Wang novalazy at gmail.com
Mon Nov 4 16:51:39 AEDT 2019


Suggestions for a better name?

----

library/string.m:
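    Add string.from_code_unit_list_allow_ill_formed/2, a variant of
    from_code_unit_list/2 that does not reject ill-formed code unit
    sequences. It still fails on out-of-range code units and on null
    characters.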

tests/hard_coded/string_from_code_unit_list.exp:
tests/hard_coded/string_from_code_unit_list.exp2:
tests/hard_coded/string_from_code_unit_list.m:
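    Extend the test case to exercise the new predicate, and add an
    ill-formed code unit sequence to both the UTF-8 and UTF-16 test inputs.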

NEWS:
    Announce the new predicate.
---
 NEWS                                          |   1 +
 library/string.m                              | 135 ++++++++++++------
 .../hard_coded/string_from_code_unit_list.exp |   7 +
 .../string_from_code_unit_list.exp2           |   7 +
 tests/hard_coded/string_from_code_unit_list.m |  26 +++-
 5 files changed, 126 insertions(+), 50 deletions(-)

diff --git a/NEWS b/NEWS
index b8827d9b1..e876cd6d3 100644
--- a/NEWS
+++ b/NEWS
@@ -420,6 +420,7 @@ Changes to the Mercury standard library:
    - is_all_alnum/1
    - is_empty/1
    - is_well_formed/1
+   - from_code_unit_list_allow_ill_formed/2
    - to_utf8_code_unit_list/2
    - to_utf16_code_unit_list/2
    - from_utf8_code_unit_list/2
diff --git a/library/string.m b/library/string.m
index 10428a0b4..9adabf662 100644
--- a/library/string.m
+++ b/library/string.m
@@ -224,6 +224,14 @@
     %
 :- pred from_code_unit_list(list(int)::in, string::uo) is semidet.
 
+    % Convert a list of code units to a string.
+    % The resulting string may contain ill-formed sequences.
+    % Fails if the list contains a code unit that is out of range
+    % or if the string would contain a null character.
+    %
+:- pred from_code_unit_list_allow_ill_formed(list(int)::in, string::uo)
+    is semidet.
+
     % Convert a list of UTF-8 code units to a string.
     % Fails if the list does not contain a valid encoding of a string
     % or if the string would contain a null character.
@@ -2009,12 +2017,18 @@ encode_utf16(Char, CodeList0, CodeList) :-
 
 %---------------------%
 
-% XXX ILSEQ to_code_unit_list(S0, L), from_code_unit_list(L, S) may fail
-% because the original string contained an ill-formed sequence.
-% Perhaps we should provide a predicate that can produce the original string.
+from_code_unit_list(CodeList, Str) :-
+    Verify = yes,
+    do_from_code_unit_list(CodeList, Verify, Str).
+
+from_code_unit_list_allow_ill_formed(CodeList, Str) :-
+    Verify = no,
+    do_from_code_unit_list(CodeList, Verify, Str).
+
+:- pred do_from_code_unit_list(list(int)::in, bool::in, string::uo) is semidet.
 
 :- pragma foreign_proc("C",
-    from_code_unit_list(CodeList::in, Str::uo),
+    do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, may_not_duplicate, no_sharing],
 "
@@ -2047,43 +2061,57 @@ encode_utf16(Char, CodeList0, CodeList) :-
 
     Str[size] = '\\0';
 
-    SUCCESS_INDICATOR = SUCCESS_INDICATOR && MR_utf8_verify(Str);
+    if (SUCCESS_INDICATOR && Verify == MR_YES) {
+        SUCCESS_INDICATOR = MR_utf8_verify(Str);
+    }
 ").
 :- pragma foreign_proc("Java",
-    from_code_unit_list(CodeList::in, Str::uo),
+    do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
     java.lang.StringBuilder sb = new java.lang.StringBuilder();
-    boolean prev_high = false;
 
     SUCCESS_INDICATOR = true;
 
-    Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
-    for (int i : iterable) {
-        // Check for null character or invalid code unit.
-        if (i <= 0 || i > 0xffff) {
-            SUCCESS_INDICATOR = false;
-            break;
+    if (Verify == bool.YES) {
+        boolean prev_high = false;
+        Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+        for (int i : iterable) {
+            // Check for null character or invalid code unit.
+            if (i <= 0 || i > 0xffff) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            char c = (char) i;
+            if (prev_high) {
+                if (!java.lang.Character.isLowSurrogate(c)) {
+                    SUCCESS_INDICATOR = false;
+                    break;
+                }
+                prev_high = false;
+            } else if (java.lang.Character.isHighSurrogate(c)) {
+                prev_high = true;
+            } else if (java.lang.Character.isLowSurrogate(c)) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            sb.append(c);
         }
-        char c = (char) i;
-        if (prev_high) {
-            if (!java.lang.Character.isLowSurrogate(c)) {
+        SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+    } else {
+        Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+        for (int i : iterable) {
+            // Check for null character or invalid code unit.
+            if (i <= 0 || i > 0xffff) {
                 SUCCESS_INDICATOR = false;
                 break;
             }
-            prev_high = false;
-        } else if (java.lang.Character.isHighSurrogate(c)) {
-            prev_high = true;
-        } else if (java.lang.Character.isLowSurrogate(c)) {
-            SUCCESS_INDICATOR = false;
-            break;
+            char c = (char) i;
+            sb.append(c);
         }
-        sb.append(c);
     }
 
-    SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
-
     if (SUCCESS_INDICATOR) {
         Str = sb.toString();
     } else {
@@ -2091,41 +2119,54 @@ encode_utf16(Char, CodeList0, CodeList) :-
     }
 ").
 :- pragma foreign_proc("C#",
-    from_code_unit_list(CodeList::in, Str::uo),
+    do_from_code_unit_list(CodeList::in, Verify::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
     System.Text.StringBuilder sb = new System.Text.StringBuilder();
-    bool prev_high = false;
 
     SUCCESS_INDICATOR = true;
 
-    while (!list.is_empty(CodeList)) {
-        int i = (int) list.det_head(CodeList);
-        // Check for null character or invalid code unit.
-        if (i <= 0 || i > 0xffff) {
-            SUCCESS_INDICATOR = false;
-            break;
+    if (Verify == mr_bool.YES) {
+        bool prev_high = false;
+        while (!list.is_empty(CodeList)) {
+            int i = (int) list.det_head(CodeList);
+            // Check for null character or invalid code unit.
+            if (i <= 0 || i > 0xffff) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            char c = (char) i;
+            if (prev_high) {
+                if (!System.Char.IsLowSurrogate(c)) {
+                    SUCCESS_INDICATOR = false;
+                    break;
+                }
+                prev_high = false;
+            } else if (System.Char.IsHighSurrogate(c)) {
+                prev_high = true;
+            } else if (System.Char.IsLowSurrogate(c)) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            sb.Append(c);
+            CodeList = list.det_tail(CodeList);
         }
-        char c = (char) i;
-        if (prev_high) {
-            if (!System.Char.IsLowSurrogate(c)) {
+        SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+    } else {
+        while (!list.is_empty(CodeList)) {
+            int i = (int) list.det_head(CodeList);
+            // Check for null character or invalid code unit.
+            if (i <= 0 || i > 0xffff) {
                 SUCCESS_INDICATOR = false;
                 break;
             }
-            prev_high = false;
-        } else if (System.Char.IsHighSurrogate(c)) {
-            prev_high = true;
-        } else if (System.Char.IsLowSurrogate(c)) {
-            SUCCESS_INDICATOR = false;
-            break;
+            char c = (char) i;
+            sb.Append(c);
+            CodeList = list.det_tail(CodeList);
         }
-        sb.Append(c);
-        CodeList = list.det_tail(CodeList);
     }
 
-    SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
-
     if (SUCCESS_INDICATOR) {
         Str = sb.ToString();
     } else {
@@ -2133,7 +2174,7 @@ encode_utf16(Char, CodeList0, CodeList) :-
     }
 ").
 :- pragma foreign_proc("Erlang",
-    from_code_unit_list(CodeList::in, Str::uo),
+    do_from_code_unit_list(CodeList::in, _Verify::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
diff --git a/tests/hard_coded/string_from_code_unit_list.exp b/tests/hard_coded/string_from_code_unit_list.exp
index 32e5b9f99..a49f01657 100644
--- a/tests/hard_coded/string_from_code_unit_list.exp
+++ b/tests/hard_coded/string_from_code_unit_list.exp
@@ -2,3 +2,10 @@ from_code_unit_list([]) = ""
 from_code_unit_list([65, 0, 66]) failed
 from_code_unit_list([240, 159, 152, 128]) = "😀"
 from_code_unit_list([-16, 159, 152, 128]) failed
+from_code_unit_list([128, 152, 159, 240]) failed
+
+from_code_unit_list_allow_ill_formed([]) = []
+from_code_unit_list_allow_ill_formed([65, 0, 66]) failed
+from_code_unit_list_allow_ill_formed([240, 159, 152, 128]) = [240, 159, 152, 128]
+from_code_unit_list_allow_ill_formed([-16, 159, 152, 128]) failed
+from_code_unit_list_allow_ill_formed([128, 152, 159, 240]) = [128, 152, 159, 240]
diff --git a/tests/hard_coded/string_from_code_unit_list.exp2 b/tests/hard_coded/string_from_code_unit_list.exp2
index a168ecc72..0eebd2f4e 100644
--- a/tests/hard_coded/string_from_code_unit_list.exp2
+++ b/tests/hard_coded/string_from_code_unit_list.exp2
@@ -2,3 +2,10 @@ from_code_unit_list([]) = ""
 from_code_unit_list([65, 0, 66]) failed
 from_code_unit_list([55357, 56832]) = "😀"
 from_code_unit_list([-10179, 56832]) failed
+from_code_unit_list([56832, 55357]) failed
+
+from_code_unit_list_allow_ill_formed([]) = []
+from_code_unit_list_allow_ill_formed([65, 0, 66]) failed
+from_code_unit_list_allow_ill_formed([55357, 56832]) = [55357, 56832]
+from_code_unit_list_allow_ill_formed([-10179, 56832]) failed
+from_code_unit_list_allow_ill_formed([56832, 55357]) = [56832, 55357]
diff --git a/tests/hard_coded/string_from_code_unit_list.m b/tests/hard_coded/string_from_code_unit_list.m
index 5f2fd2ca2..ad94bc4c4 100644
--- a/tests/hard_coded/string_from_code_unit_list.m
+++ b/tests/hard_coded/string_from_code_unit_list.m
@@ -30,7 +30,9 @@ main(!IO) :-
     else
         Cases = test_cases_16
     ),
-    list.foldl(test_from_code_unit_list, Cases, !IO).
+    list.foldl(test_from_code_unit_list, Cases, !IO),
+    io.nl(!IO),
+    list.foldl(test_from_code_unit_list_allow_ill_formed, Cases, !IO).
 
 :- func test_cases_8 = list(list(int)).
 
@@ -40,7 +42,8 @@ test_cases_8 = [
     [0xf0, 0x9f, 0x98, 0x80],
     % Sign extend first byte: the old implementation used to silently ignore
     % higher bits so this case would succeed.
-    [\0xff \/ 0xf0, 0x9f, 0x98, 0x80]
+    [\0xff \/ 0xf0, 0x9f, 0x98, 0x80],
+    [0x80, 0x98, 0x9f, 0xf0]
 ].
 
 :- func test_cases_16 = list(list(int)).
@@ -49,7 +52,8 @@ test_cases_16 = [
     [],
     [65, 0, 66],
     [0xd83d, 0xde00],
-    [\0xffff \/ 0xd83d, 0xde00]
+    [\0xffff \/ 0xd83d, 0xde00],
+    [0xde00, 0xd83d]
 ].
 
 :- pred test_from_code_unit_list(list(int)::in, io::di, io::uo) is det.
@@ -65,3 +69,19 @@ test_from_code_unit_list(CodeList, !IO) :-
     else
         io.write_string(" failed\n", !IO)
     ).
+
+:- pred test_from_code_unit_list_allow_ill_formed(list(int)::in,
+    io::di, io::uo) is det.
+
+test_from_code_unit_list_allow_ill_formed(CodeList, !IO) :-
+    io.write_string("from_code_unit_list_allow_ill_formed(", !IO),
+    io.write(CodeList, !IO),
+    io.write_string(")", !IO),
+    ( if string.from_code_unit_list_allow_ill_formed(CodeList, String) then
+        string.to_code_unit_list(String, CodeListB),
+        io.write_string(" = ", !IO),
+        io.write(CodeListB, !IO),
+        io.write_string("\n", !IO)
+    else
+        io.write_string(" failed\n", !IO)
+    ).
-- 
2.23.0



More information about the reviews mailing list