[m-rev.] for review: Add string.is_well_formed predicate.

Peter Wang novalazy at gmail.com
Thu Sep 12 16:54:23 AEST 2019


library/string.m:
    Add a predicate to test whether a string is well-formed UTF-8 or UTF-16,
    depending on the target language's string encoding.

NEWS:
    Announce the addition.

tests/hard_coded/string_well_formed.exp:
tests/hard_coded/string_well_formed.m:
    Add basic test case.

tests/hard_coded/string_well_formed_utf8.exp:
tests/hard_coded/string_well_formed_utf8.exp2:
tests/hard_coded/string_well_formed_utf8.exp3:
tests/hard_coded/string_well_formed_utf8.inp:
tests/hard_coded/string_well_formed_utf8.m:
    Add more thorough test for UTF-8. The input file is from
    https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

tests/hard_coded/Mmakefile:
    Enable the tests.
---
 NEWS                                          |   1 +
 library/string.m                              |  70 ++++++++++-
 tests/hard_coded/Mmakefile                    |   2 +
 tests/hard_coded/string_well_formed.exp       |   6 +
 tests/hard_coded/string_well_formed.m         |  43 +++++++
 tests/hard_coded/string_well_formed_utf8.exp  | 109 ++++++++++++++++++
 tests/hard_coded/string_well_formed_utf8.exp2 | 107 +++++++++++++++++
 tests/hard_coded/string_well_formed_utf8.exp3 | 109 ++++++++++++++++++
 tests/hard_coded/string_well_formed_utf8.inp  | Bin 0 -> 22781 bytes
 tests/hard_coded/string_well_formed_utf8.m    |  76 ++++++++++++
 10 files changed, 521 insertions(+), 2 deletions(-)
 create mode 100644 tests/hard_coded/string_well_formed.exp
 create mode 100644 tests/hard_coded/string_well_formed.m
 create mode 100644 tests/hard_coded/string_well_formed_utf8.exp
 create mode 100644 tests/hard_coded/string_well_formed_utf8.exp2
 create mode 100644 tests/hard_coded/string_well_formed_utf8.exp3
 create mode 100644 tests/hard_coded/string_well_formed_utf8.inp
 create mode 100644 tests/hard_coded/string_well_formed_utf8.m

diff --git a/NEWS b/NEWS
index 1fbef1f5b..a1b9d121d 100644
--- a/NEWS
+++ b/NEWS
@@ -412,6 +412,7 @@ Changes to the Mercury standard library:
 
    - is_all_alnum/1
    - is_empty/1
+   - is_well_formed/1
    - to_utf8_code_unit_list/2
    - to_utf16_code_unit_list/2
    - from_utf8_code_unit_list/2
diff --git a/library/string.m b/library/string.m
index 5f7696c34..0e5507110 100644
--- a/library/string.m
+++ b/library/string.m
@@ -444,6 +444,14 @@
     %
 :- pred is_empty(string::in) is semidet.
 
+    % True if the string is well-formed in the target language's encoding.
+    % In target languages that use UTF-8 string encoding, `is_well_formed(S)'
+    % is true iff S consists of a well-formed UTF-8 code unit sequence.
+    % In target languages that use UTF-16 string encoding, `is_well_formed(S)'
+    % is true iff S consists of a well-formed UTF-16 code unit sequence.
+    %
+:- pred is_well_formed(string::in) is semidet.
+
     % True if string contains only alphabetic characters [A-Za-z].
     %
 :- pred is_all_alpha(string::in) is semidet.
@@ -3101,11 +3109,69 @@ hash6_loop(String, Index, Length, !HashVal) :-
 %
 % Tests on strings.
 %
-% For speed, most of these predicates have C versions as well as
-% Mercury versions. XXX why not all?
 
 is_empty("").
 
+%---------------------%
+
+:- pragma foreign_proc("C",
+    is_well_formed(S::in),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    SUCCESS_INDICATOR = MR_utf8_verify(S);
+").
+:- pragma foreign_proc("Java",
+    is_well_formed(S::in),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    // Walk the UTF-16 code units; surrogates must come in high/low pairs.
+    final int len = S.length();
+    int pos = 0;
+    boolean ok = true;
+    while (ok && pos < len) {
+        final char unit = S.charAt(pos++);
+        if (java.lang.Character.isHighSurrogate(unit)) {
+            // A high surrogate needs a low surrogate right after it.
+            ok = pos < len
+                && java.lang.Character.isLowSurrogate(S.charAt(pos++));
+        } else {
+            // Anything else is fine unless it is a stray low surrogate.
+            ok = !java.lang.Character.isLowSurrogate(unit);
+        }
+    }
+    SUCCESS_INDICATOR = ok;
+").
+:- pragma foreign_proc("C#",
+    is_well_formed(S::in),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    // Walk the UTF-16 code units; surrogates must come in high/low pairs.
+    int len = S.Length;
+    int pos = 0;
+    bool ok = true;
+    while (ok && pos < len) {
+        char unit = S[pos++];
+        if (System.Char.IsHighSurrogate(unit)) {
+            // A high surrogate needs a low surrogate right after it.
+            ok = pos < len && System.Char.IsLowSurrogate(S[pos++]);
+        } else {
+            ok = !System.Char.IsLowSurrogate(unit);
+        }
+    }
+    SUCCESS_INDICATOR = ok;
+").
+
+is_well_formed(_S) :-
+    sorry($module, "string.is_well_formed/1").
+
+%---------------------%
+
+% For speed, most of these predicates have C versions as well as
+% Mercury versions. XXX why not all?
+
 % XXX ILSEQ Behaviour depends on target language.
 % The generic versions use all_match which currently uses unsafe_index_next and
 % ignores the first ill-formed sequence and everything thereafter.
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index f6447b0ea..fe700b0f6 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -372,6 +372,8 @@ ORDINARY_PROGS = \
 	string_switch3 \
 	string_to_float_overflow \
 	string_various \
+	string_well_formed \
+	string_well_formed_utf8 \
 	sv_nested_closures \
 	sv_record_update \
 	switch_detect \
diff --git a/tests/hard_coded/string_well_formed.exp b/tests/hard_coded/string_well_formed.exp
new file mode 100644
index 000000000..52483e26c
--- /dev/null
+++ b/tests/hard_coded/string_well_formed.exp
@@ -0,0 +1,6 @@
+Empty is well-formed
+Good is well-formed
+Bad1 is not well-formed
+Bad2 is not well-formed
+Bad3 is not well-formed
+Bad4 is not well-formed
diff --git a/tests/hard_coded/string_well_formed.m b/tests/hard_coded/string_well_formed.m
new file mode 100644
index 000000000..606626cbb
--- /dev/null
+++ b/tests/hard_coded/string_well_formed.m
@@ -0,0 +1,43 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+
+:- module string_well_formed.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    Empty = "", % expected to be reported as well-formed
+    Good = "\U0001F600", % 4 UTF-8 code units, 2 UTF-16 code units
+    Bad1 = string.between(Good, 0, 1), % only the first code unit of Good
+    Bad2 = string.between(Good, 0, length(Good) - 1), % last code unit dropped
+    Bad3 = Bad1 ++ Good, % ill-formed prefix, then a complete character
+    Bad4 = Good ++ Bad2, % complete character, then a truncated one
+    test_well_formed("Empty", Empty, !IO),
+    test_well_formed("Good", Good, !IO),
+    test_well_formed("Bad1", Bad1, !IO),
+    test_well_formed("Bad2", Bad2, !IO),
+    test_well_formed("Bad3", Bad3, !IO),
+    test_well_formed("Bad4", Bad4, !IO).
+
+    % Print Label followed by a verdict on whether S is well-formed.
+:- pred test_well_formed(string::in, string::in, io::di, io::uo) is det.
+
+test_well_formed(Label, S, !IO) :-
+    WellFormed = " is well-formed\n",
+    IllFormed = " is not well-formed\n",
+    Suffix = ( if string.is_well_formed(S) then WellFormed else IllFormed ),
+    io.write_string(Label ++ Suffix, !IO).
diff --git a/tests/hard_coded/string_well_formed_utf8.exp b/tests/hard_coded/string_well_formed_utf8.exp
new file mode 100644
index 000000000..731b0dced
--- /dev/null
+++ b/tests/hard_coded/string_well_formed_utf8.exp
@@ -0,0 +1,109 @@
+string encoding is UTF-8
+
+line 65: well-formed
+
+line 71: null character in input
+
+line 72: well-formed
+line 73: well-formed
+line 74: well-formed
+line 75: not well-formed
+line 76: not well-formed
+
+line 81: well-formed
+line 82: well-formed
+line 83: not well-formed
+line 84: not well-formed
+line 85: not well-formed
+
+line 89: well-formed
+line 90: well-formed
+line 91: contains replacement char
+line 92: well-formed
+line 93: not well-formed
+
+line 102: not well-formed
+line 103: not well-formed
+
+line 105: not well-formed
+line 106: not well-formed
+line 107: not well-formed
+line 108: not well-formed
+line 109: not well-formed
+line 110: not well-formed
+
+line 114: not well-formed
+line 115: not well-formed
+line 116: not well-formed
+line 117: not well-formed
+
+line 124: not well-formed
+line 125: not well-formed
+
+line 130: not well-formed
+
+line 135: not well-formed
+
+line 140: not well-formed
+
+line 145: not well-formed
+
+line 153: not well-formed
+line 154: not well-formed
+line 155: not well-formed
+line 156: not well-formed
+line 157: not well-formed
+line 158: not well-formed
+line 159: not well-formed
+line 160: not well-formed
+line 161: not well-formed
+line 162: not well-formed
+
+line 169: not well-formed
+
+line 175: not well-formed
+line 176: not well-formed
+line 177: not well-formed
+
+line 207: not well-formed
+line 208: not well-formed
+line 209: not well-formed
+line 210: not well-formed
+line 211: not well-formed
+
+line 220: not well-formed
+line 221: not well-formed
+line 222: not well-formed
+line 223: not well-formed
+line 224: not well-formed
+
+line 232: not well-formed
+line 233: not well-formed
+line 234: not well-formed
+line 235: not well-formed
+line 236: not well-formed
+
+line 247: not well-formed
+line 248: not well-formed
+line 249: not well-formed
+line 250: not well-formed
+line 251: not well-formed
+line 252: not well-formed
+line 253: not well-formed
+
+line 257: not well-formed
+line 258: not well-formed
+line 259: not well-formed
+line 260: not well-formed
+line 261: not well-formed
+line 262: not well-formed
+line 263: not well-formed
+line 264: not well-formed
+
+line 288: well-formed
+line 289: well-formed
+
+line 293: well-formed
+
+line 297: well-formed
+line 298: well-formed
diff --git a/tests/hard_coded/string_well_formed_utf8.exp2 b/tests/hard_coded/string_well_formed_utf8.exp2
new file mode 100644
index 000000000..e4b22eb39
--- /dev/null
+++ b/tests/hard_coded/string_well_formed_utf8.exp2
@@ -0,0 +1,107 @@
+string encoding is UTF-16
+
+line 65: well-formed
+
+line 72: well-formed
+line 73: well-formed
+line 74: well-formed
+line 75: contains replacement char
+line 76: contains replacement char
+
+line 81: well-formed
+line 82: well-formed
+line 83: contains replacement char
+line 84: contains replacement char
+line 85: contains replacement char
+
+line 89: well-formed
+line 90: well-formed
+line 91: contains replacement char
+line 92: well-formed
+line 93: contains replacement char
+
+line 102: contains replacement char
+line 103: contains replacement char
+
+line 105: contains replacement char
+line 106: contains replacement char
+line 107: contains replacement char
+line 108: contains replacement char
+line 109: contains replacement char
+line 110: contains replacement char
+
+line 114: contains replacement char
+line 115: contains replacement char
+line 116: contains replacement char
+line 117: contains replacement char
+
+line 124: contains replacement char
+line 125: contains replacement char
+
+line 130: contains replacement char
+
+line 135: contains replacement char
+
+line 140: contains replacement char
+
+line 145: contains replacement char
+
+line 153: contains replacement char
+line 154: contains replacement char
+line 155: contains replacement char
+line 156: contains replacement char
+line 157: contains replacement char
+line 158: contains replacement char
+line 159: contains replacement char
+line 160: contains replacement char
+line 161: contains replacement char
+line 162: contains replacement char
+
+line 169: contains replacement char
+
+line 175: contains replacement char
+line 176: contains replacement char
+line 177: contains replacement char
+
+line 207: contains replacement char
+line 208: contains replacement char
+line 209: contains replacement char
+line 210: contains replacement char
+line 211: contains replacement char
+
+line 220: contains replacement char
+line 221: contains replacement char
+line 222: contains replacement char
+line 223: contains replacement char
+line 224: contains replacement char
+
+line 232: contains replacement char
+line 233: contains replacement char
+line 234: contains replacement char
+line 235: contains replacement char
+line 236: contains replacement char
+
+line 247: contains replacement char
+line 248: contains replacement char
+line 249: contains replacement char
+line 250: contains replacement char
+line 251: contains replacement char
+line 252: contains replacement char
+line 253: contains replacement char
+
+line 257: contains replacement char
+line 258: contains replacement char
+line 259: contains replacement char
+line 260: contains replacement char
+line 261: contains replacement char
+line 262: contains replacement char
+line 263: contains replacement char
+line 264: contains replacement char
+
+line 288: well-formed
+line 289: well-formed
+
+line 293: well-formed
+
+line 297: well-formed
+line 298: well-formed
diff --git a/tests/hard_coded/string_well_formed_utf8.exp3 b/tests/hard_coded/string_well_formed_utf8.exp3
new file mode 100644
index 000000000..b7ea3695f
--- /dev/null
+++ b/tests/hard_coded/string_well_formed_utf8.exp3
@@ -0,0 +1,109 @@
+string encoding is UTF-16
+
+line 65: well-formed
+
+line 71: null character in input
+
+line 72: well-formed
+line 73: well-formed
+line 74: well-formed
+line 75: contains replacement char
+line 76: contains replacement char
+
+line 81: well-formed
+line 82: well-formed
+line 83: contains replacement char
+line 84: contains replacement char
+line 85: contains replacement char
+
+line 89: well-formed
+line 90: well-formed
+line 91: contains replacement char
+line 92: well-formed
+line 93: contains replacement char
+
+line 102: contains replacement char
+line 103: contains replacement char
+
+line 105: contains replacement char
+line 106: contains replacement char
+line 107: contains replacement char
+line 108: contains replacement char
+line 109: contains replacement char
+line 110: contains replacement char
+
+line 114: contains replacement char
+line 115: contains replacement char
+line 116: contains replacement char
+line 117: contains replacement char
+
+line 124: contains replacement char
+line 125: contains replacement char
+
+line 130: contains replacement char
+
+line 135: contains replacement char
+
+line 140: contains replacement char
+
+line 145: contains replacement char
+
+line 153: contains replacement char
+line 154: contains replacement char
+line 155: contains replacement char
+line 156: contains replacement char
+line 157: contains replacement char
+line 158: contains replacement char
+line 159: contains replacement char
+line 160: contains replacement char
+line 161: contains replacement char
+line 162: contains replacement char
+
+line 169: contains replacement char
+
+line 175: contains replacement char
+line 176: contains replacement char
+line 177: contains replacement char
+
+line 207: contains replacement char
+line 208: contains replacement char
+line 209: contains replacement char
+line 210: contains replacement char
+line 211: contains replacement char
+
+line 220: contains replacement char
+line 221: contains replacement char
+line 222: contains replacement char
+line 223: contains replacement char
+line 224: contains replacement char
+
+line 232: contains replacement char
+line 233: contains replacement char
+line 234: contains replacement char
+line 235: contains replacement char
+line 236: contains replacement char
+
+line 247: contains replacement char
+line 248: contains replacement char
+line 249: contains replacement char
+line 250: contains replacement char
+line 251: contains replacement char
+line 252: contains replacement char
+line 253: contains replacement char
+
+line 257: contains replacement char
+line 258: contains replacement char
+line 259: contains replacement char
+line 260: contains replacement char
+line 261: contains replacement char
+line 262: contains replacement char
+line 263: contains replacement char
+line 264: contains replacement char
+
+line 288: well-formed
+line 289: well-formed
+
+line 293: well-formed
+
+line 297: well-formed
+line 298: well-formed
diff --git a/tests/hard_coded/string_well_formed_utf8.inp b/tests/hard_coded/string_well_formed_utf8.inp
new file mode 100644
index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b
GIT binary patch
literal 22781
zcmdU1X_Fh*b<OAg6*tHS4b2&FHj*P%oGOwt6R{$Rs*+eKe{2AaL3 at C1xVtgnq)-VH
zIkB@!-gishSBZ8K2eD%j^pZW1v+s%M>>CYjiIrHEbIyIeVd1bb#8j*<f`b{LPv3X<
zci(;Q;)B~u8&Y$se$5G_YPao*+jOISvAvpvQQ(B3L{1o4O9d~>y4MbNJE8ngr{T%>
zH=?M0*NGF|Zns=*maBHFY*)*j-4j3B+Sy$_dEy6TNmiFvPA)BPEUj+fmviUj>}zDb
zylhzyHeB;;sk==Fw0Y8Snr+$lJK|ijTdwCUO2hB+4}n}98At=<H=WShvjf)$G0}e8
z3B|8V)ei!v8ZAo8Zr1&v<<u6%55$iePSABjM_ey*FzU$lb_}K!I<M<EUeyW9q9iFb
zAkFh5X*sqRN#sk#5gXthF>QQVnO2tbk<@(9!SA*KTMb-()7zJ}9Yk)m)3ovMPE_?<
z4&m`=AYfdi2gL9x(zW;T2&>e#!>~kZOLg0AmhhcCgBHcvb3FXf at 9Z>qKS;P_9xq{M
zX9r^v+X4}FC$KAEXd?8A)3Eozr9kXRLJ%VDrme$0ABAGOEs(JYzJ!rugIk~^6$2>n
zcEzq>1dFOMR23y23`tPI^(hkaJx~+w1GHs>5#U|33BkDdJ8tNBov>p!@zi!F0^_$X
zwVlAF6hNWW!L)ctp%pbicFV`;>TVE5OOcBa*d6d8P>GKyMu;y#v2jP!v2ly<Igqc>
zk#^vNEf_7C=(s`3piW3-P;OK9WvZqTur#5BA;#K-8dm*1m=Jj3)$CxO%20DT9xyLe
zwsDmL*Fxz!%_e`wy4l<p3>5ZQi<LT6Kyv1y=kharx#n1HAL<J2CJRCczF89wS^$-*
zRv}M}gi5IzA7jM>I8()+YdS1n&OdNLR+dk#pITbEYjt^fc~L?miBLAnr&X at 2<+Unk
z1xxnas$;=~9Ct4 at 5j@+*5Y%*_V9+xuWkXKLj~OeW4FN%?jpW=YyARqX)rK9|RoI{z
z6*L^m6}lKPRobrk?FftsJC)FhN~FyyrMJYKuwgkgxUNh6oN`}GFDwzt;8txQTCib0
zpaX3vNd^8-vO$_rm*s*-zPDPw2N~g;P^31r#;!I1WQ{2rZPN#c8k3rQ#uD?@lrel9
zc&guQIdCZHh-Bg7@$K#H&0~wwZMb+&)Ax7DrV9Ahba&}X at N9^jV}PH6<AMdX;DXh!
z*%G=fmrf?!Z&m!JdZd=S(};}6+VTCGfVno!de5cJVQ4j1{a?oo8+g=B$*u=}ixM*m
z*4(#JwM}$%LYA-X!00F&dk-#5-B2tc=xczXTXi4fuXJFV5QlNe^$v+ at Iy-iA)XC)(
zSKX|4M`XeK)*w&_(}mwS4-0^^t=b`csy^J^@%(_An++?G15_=rI$lOvDuz(umGIcd
zQ?}{jAsiR_-?DcdtL9YTZ1n+f<X~mJPB$O>M+Px?o*Hkbs)2c49sb4l;8pIYPlARu
zoGPT(Z8+q>K>~n^Evy`)jv=&wc~r>Af(RF7VRiKHP%*^JY6M2Y=RqsH9fW|J=p5L9
znCI@<O)6sumf1~cAX9>cwPR{0b+~K4%VEB*<45q|vDS%Qr|sIF1&&Z+V?K-_EmasQ
z8kByYLD(%L0K%By=V1!rzE^DozUTgkfsVl1(8MKiTZq~&WQ?&SPutJLna1mUxDM_8
zP)ZNpy(L at sZI)m$I*2Ai09`mdo-eM(DM*@X6gpHP8)DPgmBh+wV22I*AB0J!9+fy;
z8{>>!u-$}rV^pxJHiRsQ=#+ES2`J?=&=eCklnU at e6AC)5L_(+U%sfB{lLwa<+D)c0
z at KC(hsyj~Ym{ISVi82sR3hY2obbeCmm0&TKs!kix5mfO0s0&XAKVcanR?92p6&3Aj
z8SfK619~GH1bV)=)P}u8YO7}+ICuU$#}Wm8Gqf%qN3>YhoB8(iHYkdjXatJ~CgW3j
zY>f4iQ(Nk~HBd)WUqlpu?vm2{m{U#~UP-4-nIT8$v^)WO76SEQ at N+P=V`Y;of&qRv
z)cla%oCy!KxVqci&#Y-mCc=zBS!*Ss((AM;P-3Qwi3bb$e1=vyB2^fTxfonxrVgS2
zoKKuoB7(+ABNaY2vzm!32iYF<<33-#7gY~A0sMvCghhF<gFOh(L at 9Vs8XO3me4+*>
zq-e|{8C-$Ugm&8k%c>);(`u^J2$kppjT%GKyf1X4j at nzOxnwR`JcchcJNi(NkUAXU
zsBwD@$v}<yT4f)^gRETCNbS+6B;)FoK$mKcaV$o0F#mv=@9g6VR^ALX!}Q<|)L*Jc
zr?tjfq)GxtT6Z&rW at c^l9{Ld8C5OIx)}e2mdSlO85qW^rC2D7~0?pc|-f{3B=TF~j
zFwu$)(YQlLelT#HUFjlslsk6)u;tv5aJ2N)7ry at F=fC#QQ(t`QbETV8I)5rxMb1v-
z+{XGD_QRXgGVPNipbzd<wmCJCwjYKrr(TJ7Fs6dQpuQH-b_Z_p!0SlpBZVu*c9C!{
z(W&z~mdI90-*bRSN9nZ~P#}|4#Y2LU;BDmbzI?eTU{3+pMApUtH%17&{@9F&>mutT
zh$~}5e&&(OGeVveIhlv#Txx{K7vFe!R?Jf(r*fF<3Yq8RORop!41|tK>vn&mbRg_|
zm`Ke*yX7d<8wxo^y-#oF)cePZYNK$x)71O)b{_BPK~X?uV0lu#G3m<#4hJ(Lj;r?y
z0rQC89Lx$irrv9cn1}opAZH|VRK3AOy>gQIHZf-ay<LSKy`R#7Y%U<Tm|EEx{r0VY
zI|}t)qk0=By9ox$s|E9uJL^)Z<8A52qlKBL<m-s3;0AFE{)UK-Y%KF15Wi8}+vE_z
zMS}<dZxT at ntkj4&96VXnTyluea3ZcOQ)K{CUzZI7ssn%F;o at o|@H-H685*vT8VU^L
zojPnY@)u@^2!y<~KFsoso&_G>J7t0~3rOy}M?jxyqUWGiivnIv=ZUUoW)yJ@%RT>A
zZ|6_n7AGs61L#Y#T@(gG2L*alV=(oPCW|Yqyd_s}%50DMoQs02-}TDL_lo|U|9IgU
zC=ru;a?(B`3|YRkv3!@5islZ`Ka<&Mu%8qz at G5oS74<hqGhW!p>LhKL_mwUeB`g-0
z885ud+RgBaa?Su6Jh+(ko8rwxp}|7{J$ZAyStvAkr<rd~=!)Ur<-sgGG<X{#rw_s_
zPT|q$%bl2m$E6ybjG=~hYCTJ3CpjStw3{U=dzPk9!t<vOpry+{{u2+q{wIIx4R8GE
zpZVFJ`}tpZ*yz$>{x|eJ@}@Vx<*jdf`#awGu6MuZz3+SfqlZB<_ZYm#KJdX0efT3E
z{n*Dp at ySnp`tc{O+%Dc{KKqNm^vj?7mHw}O{tLhM#V4=MWD%p!p1(ZSbgF~$IZ9|@
zg!S8ed2saM1%5jc`snXjU|w4_tD$k&MOa#0(%M{7dBDVc`BIg7T&ttBKjlj4(;A;C
z%OM8p(pcEW0&zC_v`#PuCjC>2m;UAU-Y8z_ntVyVEMJkY%CF0B$ZyJT$#2W=$nVPU
z$?wZk^0gTrVd(n<`9t|5`D6JL`BV8b`E&UT`AhjL`D^(b`CIurd0L7ms`DQ&^p6 at y
z`L3KAm+#sX`8t%Z^NN=5b@{sdz5Ijxqx_Tnv;2$vtNfe%yZndzr`(Wl$ngJN9>_N+
zkBxD0tWOa~o#LpUo^Ebg{P?c6w`b&8c}|{}7v#U>zvY`V at PFJCkH$WbuTdWB<MKE;
zO&%MRN97eQj~C^C<bUN`w;_i_E=MB|<hzu|L^wJ%O&(RsqxOoH$4m0<Imm)CISP5e
zTP4{pRxY_htV`e%S+2BPtW;px`G9Nw^iffmq-fig_O)Q~4vTzPQcTj8{0v>&64GgT
zysDZJJZ*k%+6;6}KSAQkmiF_oRc4|oB7=iG(#2q13uZ9^c@>u(QBsJ|L8XgMXgXOq
zm#rG3WWd$cxU&sS8R<2D{;i>eh5T<U)62<OGr&Z&=q$=DPn52WXEvO_X~sL2Sro*(
zet9||RB$Xd76g1|vhHX|gv%fE?gbHFoU(9YAmO#f98^Kbm!>Vo7);F63rof<!I?FM
z($mu!iJ5v~$tWJ{q?odz+&Uc)Q}x1<Vd0f{3W%kI>4=zI7naP at a<-sQ;!47FNLX26
z$t>m86H2;}Fo~I(<6PcQ(on~9X!b$vkUblwOT$G!=a?r8i#%}tK7u}$E(?52e5$%z
z*gDHJG1+X$Lc>MgN8*O8*Ce66mo|5>-H8DsEtVFV6DByx&-srR$fax7FT;7z+SNRt
z9uA<fRI#Z#k<e1~=lp5bg^2U5bUoF?#G>?!e{cTuJqZ2D`v2HZW++5+qqcUo+YTB|
z5%$vF(4+}4($33K3g=HB_{TU{ceuFp^0>{9KK<6e(WXyw-d)#qrSHrK4`C4W at n1a#
ze);9^9145>{DTL(fKE5<JItKKtvNOQ8}p}cRR1yxlfLiFPodE*YjdaDLo&(sROD^x
z)nf- at +E%|>hGUG_Lmbq~a<|`Q&!6_2p=}QBWMm2ea0_}P(FGrd?Dc08U2^rTakV#X
z(Xa@!XT)ZkA}VQUq(i at Jt;5YC4X0I>Gg1m|sD$YQ%kbjn(^BG&1Lw8KC-o-N)2xl)
z*{Cy)XwN6?Gk^8aUTcTgmT;Fg4`YW4w{vK-y7n%!r at hIhc*HIwuXW+JhA~|rCz7kD
zn_I9`2oX=0$j~<I*0LVK7ziv+^Jnacs5quKRvSbO0Wl!t;Y--o4KunSBgF0wJTGqL
zM0$-ZdWwF6muN!F=*sBJH&CXni_^){uFk4%qyRtNDmM?I+KUCj^y*pK`41v1``nR(
zN&_1-*+Gh37}~sTCKP0T+%TKY2g<UIzm8lzi(6~BXNVgt;uJCIq-U=wyaX7#aL at KJ
zG~VeJ5AX&Qfq~k`5B19f_O!7V!MwwrOfgmb1#zDH7M3q9WBUN7S&#zy?9n4?VA1o>
z30Q8Z3fn^)s82;V(zUh!)zHLRu!j}ZoYp3d(oJZC#sC`E$OSyN?QmdVhS@=Y=}{-(
zrZe5oLhIafo$g}7J)};MR=h6_ACB8crtjR2W%^xdvJl5O|J8<HMLxWW{nCjY<_Ast
z$olCqM1x^M9D*7L6xi?r<RmM4r|al6_pxUu>_AObq4NWVKM1)X&S53})42gHvR~Io
zCU5;QAF<y{$B_dheQN)yU5 at T1fkHyXOhY>6wAcnkNvJ|GL31u)a|b4=={0}<yw2#m
zg&lgByC-w`l-Mzeq-5Om&HU;6VHQ5Xn#|Y6+%`S*kP~H`WjJ+qUZ(>XCp4!onNK0W
z_(((4S{kxYI#EJ5wb`r%y)(juZ6z1&m-~8h>Gr6A7gfNC(|{@KZm0;o+q!9y_RIk$
z7vs^vE!Ql8fE`MnZe=scf6dnoFWAP%kJAe|{{-W{H~n<>H}jV-^bZ=y^2RC~gR0n}
zkc;=A5NYAHD~0p8o<+db4A9Y at pgbl3uV0=OaV<lvCtXG<C=fAcnH6$92e}cqY at v6_
zpaPO}pjk0b<}j-nrhx^f&Z%Yron1u^x`Wa|a(=IU$!&F7nVDum;h?)GKbyZgp4Hu<
zI-Q20(K9r#d4;=ElD(YBm^%Cf0Sh^ziNn|fg<WIx*qlgtDWN?p^+d at KnWW`omYP4G
zvjfM4G&SALqA at Jq7L@=Z{a{y9x@}dr%`k#YS~;Qy%LnQXvs^CmWiepS$~(2O%yiU|
z<)ctS&cx68%LDqt0A<<{xoe3&0`2b;KH^IgHK+;d at CQ{us$dKl_h2r^J(%e&^by6H
zF%2SShz5~mHjqdkk at PnjQ2L01U5*A46Fcp1B%(sXN1W!s%|D>hN4#N5 at hu%i?v24T
zJ(F?mj%30|pjAJ?#2fmEhk#j-o4F^<;Vp>-Cod#(nAA=*Rwo7g`!3#-%}x${>ipI5
z!O2`BF>Lq(lPBm7t^@2UBe at rEL(U*LH;rLfodHTRS5xE6G8Hmm;>xXh<Z{M|uND5o
z{P_p at syUmzh#Kw at oAMVF@Y=*`ME(o;_o^^z&IzCi_jybJUY}fz$fM-n8$_COf|!Q;
z6cmUE_cIbnb50?1;ohJElHq<vOqz3wIU4Q(3rr37Gl1qfar^Ik76?2a?=04H5KtvG
zrz{)_eE#Y<YR+Q;FEPYIrpy853heosiL)P{^N_kTF;$mzOM2$c5Z$xt6ek0Qrevb7
zS%=vWs+(Y2So>vJ0b73K;#4*}HwR>dIZnepaCUAE6hTQO%}vGzUetHYUmh$4KVTMi
zNfR2UFij8q9n>fC1Idl)ptnatpK)KC*w6+8W0gf`1F*Flj}<nwlxSFpY16)VeDkd4
zHbg|<o(AF at j~9x<DH`7xMD<^rXVnQ1QFnxp#wQL-qq?unvzmzzQBwvYUO6m{>by43
zC48);eEiH|X~d!}__&>n8g^^E;Ek(?MZ~f#5tC7yM2Vv=mLH8tX5`n~{)LiWmHTZs
zfIPlSt;OW-$JJWE$W++IXh3%+^~WyXxV7MQ%qHF^n5l3w%v3jld22x#%-CXJgClgR
zDUQ(j_ at FSIvCSCFjLsA@r4yJ%=`>bD%O0UqReglcCyMja*iDR?(V1eVbON&|oyKzL
zFGlE8s1Z7^6z3;;9sm|&W^|^QDV at M9Lg#Ip;0OTqXie_JvV5|DfU-W8I1Zch%s+aK
zW;-RE7Lt;R(lwWfN<5k-*n at yrRL~=$aGDH at -g?Z^*m(16PpHNsaT!^{^P4=S4s$u4
zn4_g-PG6ET$^-GbU5eSWO3BPePsM`!RsujH>i4x$Ilc89&M<O!aBd1pNZ)s-)B+!i
z*jS<idhSAL*4Fhyj0sEGdRkdo?jtt7iDEaFRPY8zhnWQgtn8t9qs<>I<iv at W9VOLN
zWKXE<1ULar7t68ch4Xe?a3 at W<0yo4_KsBcg(g=Zek}k_cTr)u3re#ALP_K?JUCP8K
zR1(Y`c}5mbA;FO!JV7n1$A=v~wFBz{u_$iRtTBIm!3lqQ<b at u2bLRf@aTnqWdU{Z|
z&U6PO(-XE-0+bmST}_#63ALMaym<3!STHPio9n5j`-J5(@N3DhFX?j at wk}}p4BF=)
z at zbMt%#k at 0N$HV{_k4V~N*qtksbbLJ#l)5|JTru?IQ52V;kZ7Y{4{nceJtWFg<ZQX
zJS)kRF%~d(bnj3PF~N_z4$6M!dG~Co6`5boZsE+J+%^C4>e9-DdQnA3T!WrWLlMIw
zp{lN#KWL=_YYXWDsV)UqntpW`df>;3Z59c&tui(!b9f~fie^igAJ#+iN8yc(*PM%o
zgQ+lQ#JPjq$ur$Tjq@=p!HGQLh0Y9+wa(;}L%?JP$Ax_^nZjg87 at pla_1*mCaesQs
zBt{CNc)%f4raCGP@*p<ByaS%_3iGEA at y*PBV8C0P{b<02Rv;rD4z3nnqe<AQ5z}OI
z>Jw5#o&C&kib4X2*-!d*E^m64IZa^19ws<l{896#k22BRb$fGjS;`2K$G11PST8OO
z`;QF!ZyNUBJnX+^*njJ=|F&WO?Zf^%hW&RA`|ld|-#zTVXV`!5u>Zbc|NX=Mqr?7V
z!~O?`{SOZN9~$;QJnVmD*#GFT|FL2J<HP<ZhW$?t`=1*2KRxU}KI}g+>|ZGz&1f`h
z_Eg0#9_P<(SztEeVN5IK^2&0qE&E{3&!4{dd8KC_9bCn0 at XQtbyn>$t{6x|G*@p*L
z at ftjf&+!^Oi_h^IJd4lq8a#*3 at fsXVxA`=q{yCuGHFyqacnzKd8eW6vfri)Md3=u7
p;CXzG*Wh`4j at JOEDqJmkvO@N>BDCH5J3T8tzwg#xE%c3^^?x0SNE-kE

literal 0
HcmV?d00001

diff --git a/tests/hard_coded/string_well_formed_utf8.m b/tests/hard_coded/string_well_formed_utf8.m
new file mode 100644
index 000000000..850ace45e
--- /dev/null
+++ b/tests/hard_coded/string_well_formed_utf8.m
@@ -0,0 +1,76 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for targets using UTF-8 encoding.
+% The .exp2 file is for the Java backend.
+% The .exp3 file is for the C# backend.
+%
+% XXX The Java and C# backends currently differ in whether the null byte on
+% line 71 of the input is reported as a read error (C# does; Java does not).
+%
+%---------------------------------------------------------------------------%
+
+:- module string_well_formed_utf8.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    ( if count_code_units("\U0001F600") = 4 then
+        Banner = "string encoding is UTF-8\n"
+    else
+        Banner = "string encoding is UTF-16\n"
+    ),
+    io.write_string(Banner, !IO),
+    loop([], RevMsgs, !IO),
+    Msgs = list.remove_adjacent_dups(list.reverse(RevMsgs)),
+    io.write_list(Msgs, "\n", io.write_string, !IO).
+
+:- pred loop(list(string)::in, list(string)::out, io::di, io::uo) is det.
+
+loop(!Acc, !IO) :-
+    io.get_line_number(LineNr, !IO),
+    io.read_line_as_string(ReadResult, !IO),
+    (
+        ReadResult = eof
+    ;
+        ReadResult = error(Error),
+        ErrorText = io.error_message(Error),
+        !:Acc = [format("line %d: %s", [i(LineNr), s(ErrorText)]) | !.Acc],
+        loop(!Acc, !IO)
+    ;
+        ReadResult = ok(Line),
+        check_line(LineNr, strip(Line), !Acc),
+        loop(!Acc, !IO)
+    ).
+
+    % Classify line LineNr, whose stripped text is S, and push the resulting
+    % message onto !Acc. ASCII-only lines yield "" (main collapses repeats).
+:- pred check_line(int::in, string::in, list(string)::in, list(string)::out)
+    is det.
+
+check_line(LineNr, S, !Acc) :-
+    ( if not string.is_well_formed(S) then
+        Msg = format("line %d: not well-formed", [i(LineNr)])
+    else if string.all_match(is_ascii, S) then
+        Msg = ""
+    else if string.contains_char(S, '\uFFFD') then
+        Msg = format("line %d: contains replacement char", [i(LineNr)])
+    else
+        Msg = format("line %d: well-formed", [i(LineNr)])
+    ),
+    !:Acc = [Msg | !.Acc].
-- 
2.23.0



More information about the reviews mailing list