[m-rev.] preliminary review: UTF-8/16 support
Peter Wang
novalazy at gmail.com
Wed Dec 1 14:17:51 AEDT 2010
Here is some work in progress on UTF-8 and UTF-16 string support, in
case someone would like to look at it or make suggestions.
We would commit to using the Universal Character Set instead of
trying to be agnostic about string encodings.
Mercury `char' becomes a Unicode code point, so is re-defined as a
32-bit integer (at least).
String indexing predicates return code points instead of code units.
e.g. `string.index(String, CodeUnitOffset, CodePoint)' returns the
code point beginning at CodeUnitOffset.
When you iterate over a string you should not advance to Index + 1 any
more, so there is are predicates which return the next/previous offset:
string.index_next(String, Offset, CodePoint, NextOffset)
string.prev_index(String, NextOffset, CodePoint, Offset)
diff --git a/compiler/c_util.m b/compiler/c_util.m
index c162176..a72a558 100644
--- a/compiler/c_util.m
+++ b/compiler/c_util.m
@@ -179,6 +179,7 @@
:- implementation.
+:- import_module libs.compiler_util.
:- import_module libs.options.
:- import_module bool.
@@ -307,7 +308,27 @@ output_quoted_string(S, !IO) :-
output_quoted_string_lang(literal_c, S, !IO).
output_quoted_string_lang(Lang, S, !IO) :-
- do_output_quoted_string(Lang, 0, length(S), S, !IO).
+ (
+ Lang = literal_c,
+ % Avoid a limitation in the MSVC compiler where string literals can be
+ % no longer than 2048 chars. However if you output the string in
+ % chunks, eg "part a" "part b" it will accept a string longer than 2048
+ % chars, go figure!
+ % XXX we could avoid creating new strings
+ string.split_by_codepoint(S, 160, Left, Right),
+ do_output_quoted_string(Lang, Left, 0, !IO),
+ ( Right = "" ->
+ true
+ ;
+ io.write_string("\" \"", !IO),
+ output_quoted_string_lang(Lang, Right, !IO)
+ )
+ ;
+ ( Lang = literal_java
+ ; Lang = literal_csharp
+ ),
+ do_output_quoted_string(Lang, S, 0, !IO)
+ ).
output_quoted_multi_string(Ss, !IO) :-
output_quoted_multi_string_lang(literal_c, Ss, !IO).
@@ -318,50 +339,13 @@ output_quoted_multi_string_lang(Lang, [S | Ss], !IO) :-
output_quoted_char_lang(Lang, char.det_from_int(0), !IO),
output_quoted_multi_string_lang(Lang, Ss, !IO).
-:- pred do_output_quoted_string(literal_language::in, int::in, int::in,
- string::in, io::di, io::uo) is det.
+:- pred do_output_quoted_string(literal_language::in, string::in,
+ int::in, io::di, io::uo) is det.
-do_output_quoted_string(Lang, Cur, Len, S, !IO) :-
- ( Cur < Len ->
- % Avoid a limitation in the MSVC compiler where string literals
- % can be no longer than 2048 chars. However if you output the string
- % in chunks, eg "part a" "part b" it will accept a string longer than
- % 2048 chars, go figure!
- (
- Lang = literal_c,
- Cur \= 0,
- Cur mod 512 = 0
- ->
- io.write_string("\" \"", !IO)
- ;
- true
- ),
-
- string.unsafe_index(S, Cur, Char),
+do_output_quoted_string(Lang, S, Cur, !IO) :-
+ ( string.unsafe_index_next(S, Cur, Next, Char) ->
output_quoted_char_lang(Lang, Char, !IO),
-
- % Check for trigraph sequences in string literals. We break the
- % trigraph by breaking the string into multiple chunks. For example,
- % "??-" gets converted to "?" "?-".
- (
- Lang = literal_c,
- Char = '?',
- Cur + 2 < Len
- ->
- (
- string.unsafe_index(S, Cur + 1, '?'),
- string.unsafe_index(S, Cur + 2, ThirdChar),
- is_trigraph_char(ThirdChar)
- ->
- io.write_string("\" \"", !IO)
- ;
- true
- )
- ;
- true
- ),
-
- do_output_quoted_string(Lang, Cur + 1, Len, S, !IO)
+ do_output_quoted_string(Lang, S, Next, !IO)
;
true
).
@@ -403,25 +387,40 @@ quote_one_char(Lang, Char, RevChars0, RevChars) :-
->
RevChars = [EscapeChar, '\\' | RevChars0]
;
- is_c_source_char(Char)
+ Lang = literal_c,
+ Char = '?'
->
- RevChars = [Char | RevChars0]
+ % Avoid trigraphs by escaping the question marks.
+ RevChars = ['?', '\\' | RevChars0]
;
- Lang = literal_java,
- char.to_int(Char) >= 0x80
+ is_c_source_char(Char)
->
- % If the compiler is built in a C grade (8-bit strings), we assume that
- % both the Mercury source file and Java target file use UTF-8 encoding.
- % Each `Char' will be a UTF-8 code unit in a multi-byte sequence.
- % If the compiler is built in a Java backend, each `Char' will be a
- % UTF-16 code unit, possibly of a surrogate pair. In both cases the
- % code units must be passed through without escaping.
RevChars = [Char | RevChars0]
;
char.to_int(Char, 0)
->
RevChars = ['0', '\\' | RevChars0]
;
+ Int = char.to_int(Char),
+ Int >= 0x80
+ ->
+ (
+ Lang = literal_c,
+ ( char.to_utf8(Char, CodeUnits) ->
+ list.map(hex_escape_any_int, CodeUnits, EscapeCharss),
+ list.condense(EscapeCharss, EscapeChars),
+ reverse_append(EscapeChars, RevChars0, RevChars)
+ ;
+ unexpected(this_file, "invalid Unicode code point")
+ )
+ ;
+ Lang = literal_java,
+ RevChars = [Char | RevChars0]
+ ;
+ Lang = literal_csharp,
+ RevChars = [Char | RevChars0]
+ )
+ ;
(
Lang = literal_c,
octal_escape_any_char(Char, EscapeChars)
@@ -453,20 +452,6 @@ escape_special_char('\v', 'v'). % not in Java
escape_special_char('\r', 'r').
escape_special_char('\f', 'f').
- % Succeed if the given character, prefixed with "??", is a trigraph.
- %
-:- pred is_trigraph_char(char::in) is semidet.
-
-is_trigraph_char('(').
-is_trigraph_char(')').
-is_trigraph_char('<').
-is_trigraph_char('>').
-is_trigraph_char('=').
-is_trigraph_char('/').
-is_trigraph_char('\'').
-is_trigraph_char('!').
-is_trigraph_char('-').
-
% This succeeds iff the specified character is allowed as an (unescaped)
% character in standard-conforming C source code.
%
@@ -513,7 +498,12 @@ octal_escape_any_char(Char, EscapeCodeChars) :-
%
hex_escape_any_char(Char, EscapeCodeChars) :-
char.to_int(Char, Int),
- string.format("\\x%04x", [i(Int)], HexString),
+ hex_escape_any_int(Int, EscapeCodeChars).
+
+:- pred hex_escape_any_int(int::in, list(char)::out) is det.
+
+hex_escape_any_int(Int, EscapeCodeChars) :-
+ string.format("\\x%02x", [i(Int)], HexString),
string.to_char_list(HexString, EscapeCodeChars).
%-----------------------------------------------------------------------------%
@@ -645,3 +635,9 @@ is_valid_c_identifier(S) :-
string.is_all_alnum_or_underscore(S).
%-----------------------------------------------------------------------------%
+
+:- func this_file = string.
+
+this_file = "c_util.m".
+
+%-----------------------------------------------------------------------------%
diff --git a/compiler/mlds_to_cs.m b/compiler/mlds_to_cs.m
index 3848de2..62a217c 100644
--- a/compiler/mlds_to_cs.m
+++ b/compiler/mlds_to_cs.m
@@ -2240,7 +2240,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
ArrayDims = []
;
MLDS_Type = mlds_native_char_type,
- String = "char",
+ % String = "char",
+ String = "int",
ArrayDims = []
;
MLDS_Type = mlds_foreign_type(ForeignType),
@@ -2335,7 +2336,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
(
CtorCat = ctor_cat_builtin(cat_builtin_char),
- String = "char",
+ % String = "char",
+ String = "int",
ArrayDims = []
;
CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -3515,9 +3517,13 @@ csharp_builtin_type(Type, "double") :-
Type = mlds_native_float_type.
csharp_builtin_type(Type, "double") :-
Type = mercury_type(builtin_type(builtin_type_float), _, _).
-csharp_builtin_type(Type, "char") :-
+% csharp_builtin_type(Type, "char") :-
+% Type = mlds_native_char_type.
+% csharp_builtin_type(Type, "char") :-
+% Type = mercury_type(builtin_type(builtin_type_char), _, _).
+csharp_builtin_type(Type, "int") :-
Type = mlds_native_char_type.
-csharp_builtin_type(Type, "char") :-
+csharp_builtin_type(Type, "int") :-
Type = mercury_type(builtin_type(builtin_type_char), _, _).
csharp_builtin_type(Type, "bool") :-
Type = mlds_native_bool_type.
@@ -3611,7 +3617,8 @@ output_rval_const(Info, Const, !IO) :-
output_int_const(N, !IO)
;
Const = mlconst_char(N),
- io.write_string("((char) ", !IO),
+ % io.write_string("((char) ", !IO),
+ io.write_string("( ", !IO),
output_int_const(N, !IO),
io.write_string(")", !IO)
;
diff --git a/compiler/mlds_to_java.m b/compiler/mlds_to_java.m
index 6bd9186..1f5d0d9 100644
--- a/compiler/mlds_to_java.m
+++ b/compiler/mlds_to_java.m
@@ -3424,7 +3424,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
ArrayDims = []
;
MLDS_Type = mlds_native_char_type,
- String = "char",
+ % String = "char",
+ String = "int",
ArrayDims = []
;
MLDS_Type = mlds_foreign_type(ForeignType),
@@ -3518,7 +3519,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
(
CtorCat = ctor_cat_builtin(cat_builtin_char),
- String = "char",
+ % String = "char",
+ String = "int",
ArrayDims = []
;
CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -4847,9 +4849,13 @@ java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
Type = mlds_native_float_type.
java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
Type = mercury_type(builtin_type(builtin_type_float), _, _).
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+% java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+% Type = mlds_native_char_type.
+% java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+% Type = mercury_type(builtin_type(builtin_type_char), _, _).
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
Type = mlds_native_char_type.
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
Type = mercury_type(builtin_type(builtin_type_char), _, _).
java_builtin_type(Type, "boolean", "java.lang.Boolean", "booleanValue") :-
Type = mlds_native_bool_type.
@@ -4976,7 +4982,8 @@ output_rval_const(Info, Const, !IO) :-
output_int_const(N, !IO)
;
Const = mlconst_char(N),
- io.write_string("((char) ", !IO),
+ % io.write_string("((char) ", !IO),
+ io.write_string("((int) ", !IO),
output_int_const(N, !IO),
io.write_string(")", !IO)
;
diff --git a/doc/reference_manual.texi b/doc/reference_manual.texi
index cf654ca..a53a2d3 100644
--- a/doc/reference_manual.texi
+++ b/doc/reference_manual.texi
@@ -6689,11 +6689,13 @@ than @samp{sizeof(MR_Word)}.
@node C# data passing conventions
@subsection C# data passing conventions
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
and @code{string} are mapped to the Common Language Infrastructure (CLI) types
- at code{System.Int32}, @code{System.Double}, @code{System.Char} and
+ at code{System.Int32}, @code{System.Double}, and
@code{System.String} respectively, which correspond to the C# types
- at code{int}, @code{double}, @code{char}, and @code{string}.
+ at code{int}, @code{double}, and @code{string}.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the CLI
+type @code{System.Char} because that only holds 16-bit numeric values.
For the Mercury standard library type @samp{bool.bool}, there is a
corresponding C# type, @code{mr_bool.Bool_0}. C# code can refer to the
@@ -6805,10 +6807,12 @@ List_1 list.cons(object head, List_1 tail)
@node Java data passing conventions
@subsection Java data passing conventions
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
and @code{string} are mapped to the Java types
- at code{int}, @code{double}, @code{char} and
+ at code{int}, @code{double}, and
@code{java.lang.String} respectively.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the Java
+type @code{char} because that only holds 16-bit numeric values.
For the Mercury standard library type @samp{bool.bool}, there is a
corresponding Java type, @code{bool.Bool_0}. Java code can refer to the
diff --git a/library/char.m b/library/char.m
index 5771ebf..51dfb3f 100644
--- a/library/char.m
+++ b/library/char.m
@@ -24,31 +24,28 @@
:- module char.
:- interface.
+
:- import_module enum.
+:- import_module list.
:- import_module pretty_printer.
%-----------------------------------------------------------------------------%
+ % A Unicode code point.
+ %
:- type char == character.
:- instance enum(character).
% Convert a character to its corresponding numerical code (integer value).
- % Beware that the mapping from characters to numerical codes is
- % implementation-dependent; there is no guarantee that the integer values
- % for characters will fit in 8 bits. Furthermore, the value returned from
- % char.to_int might be different than the byte(s) used to store the
- % character in a file. There is also no guarantee that characters created
- % using `char.to_int(out, in)' can be written to files or to the standard
- % output or standard error streams. For example, an implementation might
- % represent characters using Unicode, but store files in an 8-bit national
- % character set.
%
% Note that '\0' is not accepted as a Mercury null character constant.
% Instead, a null character can be created using `char.det_from_int(0)'.
% Null characters aren't very useful in Mercury because they aren't
% allowed in strings.
%
+ % Codepoints outside the Unicode range are also not accepted.
+ %
:- func char.to_int(char) = int.
:- pred char.to_int(char, int).
:- mode char.to_int(in, out) is det.
@@ -97,8 +94,8 @@
:- mode char.lower_upper(in, out) is semidet.
:- mode char.lower_upper(out, in) is semidet.
- % True iff the character is whitespace, i.e. a space, tab,
- % newline, carriage return, form-feed, or vertical tab.
+ % True iff the character is a whitespace character in the ASCII range,
+ % i.e. a space, tab, newline, carriage return, form-feed, or vertical tab.
%
:- pred char.is_whitespace(char::in) is semidet.
@@ -175,11 +172,26 @@
%
:- func char.char_to_doc(char) = pretty_printer.doc.
+ % Encode a Unicode code point in UTF-8.
+ %
+:- pred char.to_utf8(char::in, list(int)::out) is semidet.
+
+ % Encode a Unicode code point in UTF-16 (native endianness).
+ %
+:- pred char.to_utf16(char::in, list(int)::out) is semidet.
+
+ % Succeed if `Char' is a Unicode surrogate code point.
+ % In UTF-16, a code point with a scalar value greater than 0xffff
+ % is encoded with a pair of surrogate code points.
+ %
+:- pred char.is_surrogate(char::in) is semidet.
+
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
+:- import_module int.
:- import_module require.
:- import_module term_io.
@@ -513,13 +525,8 @@ char.det_from_int(Int) = Char :-
[will_not_call_mercury, promise_pure, thread_safe,
does_not_affect_liveness],
"
- /*
- ** If the integer doesn't fit into a char, then the assignment
- ** `Character = Int' below will truncate it. SUCCESS_INDICATOR will be set
- ** to true only if the result was not truncated.
- */
Character = Int;
- SUCCESS_INDICATOR = ((MR_UnsignedChar) Character == Int);
+ SUCCESS_INDICATOR = (Character >= 0 && Character <= 0x10ffff);
").
:- pragma foreign_proc("C#",
@@ -540,8 +547,8 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Character = (char) Int;
- SUCCESS_INDICATOR = (Character == Int);
+ Character = Int;
+ SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Java",
@@ -562,8 +569,8 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Character = (char) Int;
- SUCCESS_INDICATOR = ((int) Character == Int);
+ Character = Int;
+ SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Erlang",
@@ -584,6 +591,7 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
+ % XXX Unicode
case
Int >= 0 andalso Int < 256
of
@@ -607,33 +615,70 @@ char.min_char_value(0).
[will_not_call_mercury, promise_pure, thread_safe,
does_not_affect_liveness],
"
- Max = UCHAR_MAX;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("C#",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- // .NET uses 16-bit 'Unicode'. This might be either UCS-2,
- // where Unicode characters that don't fit in 16 bits are encoded
- // in two 16 bit characters, or it might be just the 16-bit subset,
- // i.e. only the Unicode characters that fit in 16 bits.
- // For our purposes, it doesn't matter.
- Max = 0xffff;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("Java",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Max = (int) java.lang.Character.MAX_VALUE;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("Erlang",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- % XXX actually Erlang doesn't have chars so there should be no maximum
- Max = 255
+ Max = 16#10ffff
").
+char.to_utf8(Char, CodeUnits) :-
+ Int = char.to_int(Char),
+ ( Int =< 0x7f ->
+ CodeUnits = [Int]
+ ; Int =< 0x7ff ->
+ A = 0xc0 \/ ((Int >> 6) /\ 0x1f),
+ B = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B]
+ ; Int =< 0xffff ->
+ A = 0xe0 \/ ((Int >> 12) /\ 0x0f),
+ B = 0x80 \/ ((Int >> 6) /\ 0x3f),
+ C = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B, C]
+ ; Int =< 0x10ffff ->
+ A = 0xf0 \/ ((Int >> 18) /\ 0x07),
+ B = 0x80 \/ ((Int >> 12) /\ 0x3f),
+ C = 0x80 \/ ((Int >> 6) /\ 0x3f),
+ D = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B, C, D]
+ ;
+ % Illegal code point.
+ fail
+ ).
+
+char.to_utf16(Char, CodeUnits) :-
+ Int = char.to_int(Char),
+ ( Int =< 0xffff ->
+ CodeUnits = [Int]
+ ; Int =< 0x10ffff ->
+ U = Int - 0x10000,
+ A = 0xd800 \/ (U >> 10),
+ B = 0xdc00 \/ (U /\ 0x3ff),
+ CodeUnits = [A, B]
+ ;
+ % Illegal code point.
+ fail
+ ).
+
+char.is_surrogate(Char) :-
+ Int = char.to_int(Char),
+ Int >= 0xd800,
+ Int =< 0xdfff.
+
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% Ralph Becket <rwab1 at cl.cam.ac.uk> 27/04/99
diff --git a/library/io.m b/library/io.m
index c90613e..c46ec6e 100644
--- a/library/io.m
+++ b/library/io.m
@@ -134,7 +134,7 @@
% Text input predicates
%
- % Reads a character from the current input stream.
+ % Reads a character (code point) from the current input stream.
%
:- pred io.read_char(io.result(char)::out, io::di, io::uo) is det.
@@ -143,7 +143,7 @@
:- pred io.read_word(io.result(list(char))::out, io::di, io::uo) is det.
% Reads a line from the current input stream, returns the result
- % as a list of chars.
+ % as a list of characters (code points).
%
:- pred io.read_line(io.result(list(char))::out, io::di, io::uo) is det.
@@ -153,14 +153,14 @@
%
:- pred io.read_line_as_string(io.result(string)::out, io::di, io::uo) is det.
- % Reads all the characters from the current input stream until
- % eof or error.
+ % Reads all the characters (code points) from the current input stream
+ % until eof or error.
%
:- pred io.read_file(io.maybe_partial_res(list(char))::out, io::di, io::uo)
is det.
- % Reads all the characters from the current input stream until
- % eof or error. Returns the result as a string rather than
+ % Reads all the characters (code points) from the current input stream
+ % until eof or error. Returns the result as a string rather than
% as a list of char.
%
% Returns an error if the file contains a null character, because
@@ -169,7 +169,7 @@
:- pred io.read_file_as_string(io.maybe_partial_res(string)::out,
io::di, io::uo) is det.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl(pred(char, T, T), T, io.maybe_partial_res(T),
@@ -179,7 +179,7 @@
:- mode io.input_stream_foldl((pred(in, in, out) is cc_multi), in, out,
di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl_io(pred(char, io, io), io.res, io, io).
@@ -188,7 +188,7 @@
:- mode io.input_stream_foldl_io((pred(in, di, uo) is cc_multi), out, di, uo)
is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl2_io(pred(char, T, T, io, io),
@@ -198,9 +198,9 @@
:- mode io.input_stream_foldl2_io((pred(in, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from the input stream
- % in turn, until eof or error, or the closure returns `no' as its
- % second argument.
+ % Applies the given closure to each character (code point) read from the
+ % input stream in turn, until eof or error, or the closure returns `no' as
+ % its second argument.
%
:- pred io.input_stream_foldl2_io_maybe_stop(
pred(char, bool, T, T, io, io),
@@ -212,7 +212,7 @@
(pred(in, out, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Un-reads a character from the current input stream.
+ % Un-reads a character (code point) from the current input stream.
% You can put back as many characters as you like.
% You can even put back something that you didn't actually read.
% Note: `io.putback_char' uses the C library function ungetc().
@@ -221,12 +221,12 @@
%
:- pred io.putback_char(char::in, io::di, io::uo) is det.
- % Reads a character from specified stream.
+ % Reads a character (code point) from specified stream.
%
:- pred io.read_char(io.input_stream::in, io.result(char)::out,
io::di, io::uo) is det.
- % Reads a character from the specified stream.
+ % Reads a character (code point) from the specified stream.
% This interface avoids memory allocation when there is no error.
%
:- pred io.read_char_unboxed(io.input_stream::in, io.result::out, char::out,
@@ -238,7 +238,7 @@
io::di, io::uo) is det.
% Reads a line from specified stream, returning the result
- % as a list of chars.
+ % as a list of characters (code point).
%
:- pred io.read_line(io.input_stream::in, io.result(list(char))::out,
io::di, io::uo) is det.
@@ -250,7 +250,7 @@
:- pred io.read_line_as_string(io.input_stream::in, io.result(string)::out,
io::di, io::uo) is det.
- % Reads all the characters from the given input stream until
+ % Reads all the characters (code points) from the given input stream until
% eof or error.
%
:- pred io.read_file(io.input_stream::in,
@@ -265,7 +265,7 @@
:- pred io.read_file_as_string(io.input_stream::in,
io.maybe_partial_res(string)::out, io::di, io::uo) is det.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl(io.input_stream, pred(char, T, T),
@@ -275,7 +275,7 @@
:- mode io.input_stream_foldl(in, in(pred(in, in, out) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl_io(io.input_stream, pred(char, io, io),
@@ -285,7 +285,7 @@
:- mode io.input_stream_foldl_io(in, in(pred(in, di, uo) is cc_multi),
out, di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl2_io(io.input_stream,
@@ -298,9 +298,9 @@
in(pred(in, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from the input stream
- % in turn, until eof or error, or the closure returns `no' as its
- % second argument.
+ % Applies the given closure to each character (code point) read from the
+ % input stream in turn, until eof or error, or the closure returns `no' as
+ % its second argument.
%
:- pred io.input_stream_foldl2_io_maybe_stop(io.input_stream,
pred(char, bool, T, T, io, io),
@@ -1280,6 +1280,8 @@
% file. It is the responsibility of the program to delete the file
% when it is no longer needed.
%
+ % XXX don't know why this description is so specific as to Prefix
+ %
:- pred io.make_temp(string::in, string::in, string::out, io::di, io::uo)
is det.
@@ -1807,12 +1809,12 @@
% we don't want the C/Java/etc code to depend on how Mercury stores
% its discriminated union data types.
- % Reads a character from specified stream, and returns the numerical value
- % for that character (as from char.to_int). This may involve converting
- % external character encodings into Mercury's internal character
- % representation and (for text streams) converting OS line indicators,
- % e.g. CR-LF for Windows, to '\n' characters. Returns -1 if at EOF,
- % -2 if an error occurs.
+ % Reads a character (code point) from specified stream, and returns the
+ % numerical value for that character (as from char.to_int). This may
+ % involve converting external character encodings into Mercury's internal
+ % character representation and (for text streams) converting OS line
+ % indicators, e.g. CR-LF for Windows, to '\n' characters.
+ % Returns -1 if at EOF, -2 if an error occurs.
%
:- pred io.read_char_code(io.input_stream::in, int::out, io::di, io::uo)
is det.
@@ -2171,15 +2173,15 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
#define ML_IO_BYTES_TO_WORDS(n) (((n) + sizeof(MR_Word) - 1) / sizeof(MR_Word))
#define ML_IO_READ_LINE_START 1024
- MR_Char initial_read_buffer[ML_IO_READ_LINE_START];
- MR_Char *read_buffer = initial_read_buffer;
+ char initial_read_buffer[ML_IO_READ_LINE_START];
+ char *read_buffer = initial_read_buffer;
size_t read_buf_size = ML_IO_READ_LINE_START;
size_t i;
int char_code = '\\0';
Res = 0;
for (i = 0; char_code != '\\n'; ) {
- char_code = mercury_getc(Stream);
+ char_code = mercury_get_byte(Stream);
if (char_code == EOF) {
if (i == 0) {
Res = -1;
@@ -5815,6 +5817,18 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
return true;
}
+ private int read_code_unit()
+ throws java.io.IOException
+ {
+ if (!refill_buffer()) {
+ return -1;
+ }
+
+ char c = buf[buf_pos];
+ buf_pos++;
+ return (int) c;
+ }
+
/*
** read_char(): [Java]
**
@@ -5824,18 +5838,24 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
public int read_char()
throws java.io.IOException
{
- if (!refill_buffer()) {
+ final int c1 = read_code_unit();
+ if (c1 == -1) {
return -1;
}
-
- char c = buf[buf_pos];
- buf_pos++;
-
- if (c == '\\n') {
+ if (c1 == '\\n') {
line_number++;
}
+ if (!Character.isHighSurrogate((char) c1)) {
+ return c1;
+ }
- return c;
+ final int c2 = read_code_unit();
+ if (c2 != -1 && !Character.isLowSurrogate((char) c2)) {
+ // XXX Unicode: what to do?
+ return c1;
+ }
+
+ return Character.toCodePoint((char) c1, (char) c2);
}
/*
@@ -5908,7 +5928,7 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
}
}
- public void ungetc(char c) {
+ private void unget_code_unit(char c) {
/*
** If necessary, shift the unread characters in the input buffer
** to make room at the front of the buffer. If the buffer is full
@@ -5930,11 +5950,19 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
}
}
- buf_pos--;
- buf[buf_pos] = c;
+ buf[--buf_pos] = c;
+ }
- if (c == '\\n') {
- line_number--;
+ public void ungetc(int c) {
+ if (Character.charCount(c) == 1) {
+ unget_code_unit((char) c);
+ if (c == '\\n') {
+ line_number--;
+ }
+ } else {
+ char[] units = Character.toChars(c);
+ unget_code_unit(units[1]);
+ unget_code_unit(units[0]);
}
}
@@ -7001,7 +7029,11 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
// So we have to output each character separately.
for (int i = 0; i < s.Length; i++) {
- if (s[i] == '\\n') {
+ if (System.Char.IsSurrogate(s[i])) {
+ // XXX more error checking
+ mf.writer.Write(s.Substring(i, 2));
+ i++;
+ } else if (s[i] == '\\n') {
mf.line_number++;
mf.writer.WriteLine("""");
} else {
@@ -7016,8 +7048,9 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
:- pragma foreign_code("C", "
+/* XXX This is really get_byte, not get_char. */
int
-mercury_getc(MercuryFilePtr mf)
+mercury_get_byte(MercuryFilePtr mf)
{
int c = MR_GETCH(*mf);
if (c == '\\n') {
@@ -7285,6 +7318,16 @@ mercury_close(MercuryFilePtr mf)
:- pragma foreign_code("C#", "
public static void
+mercury_write_codepoint(System.IO.TextWriter w, int c)
+{
+ if (c <= 0xffff) {
+ w.Write((char) c);
+ } else {
+ w.Write(System.Char.ConvertFromUtf32(c));
+ }
+}
+
+public static void
mercury_close(MR_MercuryFileStruct mf)
{
if (mf.reader != null) {
@@ -7322,6 +7365,10 @@ ML_fprintf(MercuryFilePtr mf, const char *format, ...)
% Input predicates
%
+:- pragma foreign_decl("C", local, "
+ extern int MR_utf8_get(const unsigned char *s, int pos);
+").
+
io.read_char_code(input_stream(Stream), CharCode, !IO) :-
io.read_char_code_2(Stream, CharCode, !IO).
@@ -7333,7 +7380,49 @@ io.read_char_code(input_stream(Stream), CharCode, !IO) :-
[will_not_call_mercury, promise_pure, tabled_for_io,
does_not_affect_liveness, no_sharing],
"
- CharCode = mercury_getc(Stream);
+ char buf[5];
+ int nbytes;
+ int i;
+ int c;
+ unsigned int uc;
+
+ c = mercury_get_byte(Stream);
+ uc = c;
+ if (uc <= 0x7f) {
+ CharCode = uc;
+ } else if (c == EOF) {
+ CharCode = -1;
+ } else {
+ if ((uc & 0xE0) == 0xC0) {
+ nbytes = 2;
+ } else if ((uc & 0xF0) == 0xE0) {
+ nbytes = 3;
+ } else if ((uc & 0xF8) == 0xF0) {
+ nbytes = 4;
+ } else {
+ nbytes = 0;
+ }
+ if (nbytes > 0) {
+ buf[0] = uc;
+ for (i = 1; i < nbytes; i++) {
+ uc = mercury_get_byte(Stream);
+ buf[i] = uc;
+ /* XXX break early if not a trail byte? */
+ }
+ buf[i] = '\\0';
+ CharCode = MR_utf8_get(buf, 0);
+ if (CharCode < 0) {
+ /* Invalid byte sequence. */
+ errno = EILSEQ;
+ CharCode = -2;
+ }
+ } else {
+ /* Invalid byte sequence. */
+ errno = EILSEQ;
+ CharCode = -2;
+ }
+ }
+
MR_update_io(IO0, IO);
").
@@ -7347,7 +7436,7 @@ io.read_byte_val(input_stream(Stream), ByteVal, !IO) :-
[will_not_call_mercury, promise_pure, tabled_for_io,
does_not_affect_liveness, no_sharing],
"
- ByteVal = mercury_getc(Stream);
+ ByteVal = mercury_get_byte(Stream);
MR_update_io(IO0, IO);
").
@@ -7364,9 +7453,20 @@ io.putback_char(input_stream(Stream), Character, !IO) :-
if (Character == '\\n') {
MR_line_number(*mf)--;
}
- /* XXX should work even if ungetc() fails */
- if (MR_UNGETCH(*mf, Character) == EOF) {
- mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ if (Character <= 0x7f) {
+ if (MR_UNGETCH(*mf, Character) == EOF) {
+ mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ }
+ } else {
+ char buf[5];
+ ssize_t len;
+ len = MR_utf8_encode(buf, Character);
+ /* XXX this depends on multiple putback */
+ for (; len > 0; len--) {
+ if (MR_UNGETCH(*mf, buf[len - 1]) == EOF) {
+ mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -7381,9 +7481,18 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
does_not_affect_liveness, no_sharing],
"
MercuryFilePtr mf = Stream;
- /* XXX should work even if ungetc() fails */
if (MR_UNGETCH(*mf, Character) == EOF) {
mercury_io_error(mf, ""io.putback_byte: ungetc failed"");
+ } else {
+ char buf[5];
+ ssize_t len;
+ len = MR_utf8_encode(buf, Character);
+ /* XXX this depends on multiple putback */
+ for (; len > 0; len--) {
+ if (MR_UNGETCH(*mf, buf[len - 1]) == EOF) {
+ mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -7502,6 +7611,10 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
% Output predicates (with output to mercury_current_text_output)
%
+:- pragma foreign_decl("C", local, "
+ extern size_t MR_utf8_encode(char s[], int c);
+").
+
:- pragma foreign_proc("C",
io.write_string(Message::in, IO0::di, IO::uo),
[may_call_mercury, promise_pure, tabled_for_io, thread_safe, terminates,
@@ -7517,11 +7630,24 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
does_not_affect_liveness, no_sharing],
"
MercuryFilePtr out = mercury_current_text_output();
- if (MR_PUTCH(*out, Character) < 0) {
- mercury_output_error(out);
- }
- if (Character == '\\n') {
- MR_line_number(*out)++;
+ char buf[5];
+ size_t len;
+ int i;
+ if (Character <= 0x7f) {
+ if (MR_PUTCH(*out, Character) < 0) {
+ mercury_output_error(out);
+ }
+ if (Character == '\\n') {
+ MR_line_number(*out)++;
+ }
+ } else {
+ len = MR_utf8_encode(buf, Character);
+ for (i = 0; i < len; i++) {
+ if (MR_PUTCH(*out, buf[i]) < 0) {
+ mercury_output_error(out);
+ break;
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -7623,7 +7749,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
switch (io.mercury_current_text_output.line_ending) {
case io.ML_line_ending_kind.ML_raw_binary:
case io.ML_line_ending_kind.ML_Unix_line_ending:
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
break;
case io.ML_line_ending_kind.ML_OS_line_ending:
w.WriteLine("""");
@@ -7631,7 +7757,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
}
io.mercury_current_text_output.line_number++;
} else {
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
}
").
@@ -7674,8 +7800,11 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
io.write_char(Chr::in, _IO0::di, _IO::uo),
[may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
"
- io.mercury_current_text_output.get().write_or_throw(
- Character.toString(Chr));
+ MR_TextOutputFile stream = io.mercury_current_text_output.get();
+ char[] buf = java.lang.Character.toChars(Chr);
+ for (char c : buf) {
+ stream.put_or_throw(c);
+ }
").
:- pragma foreign_proc("Java",
io.write_int(Val::in, _IO0::di, _IO::uo),
@@ -7854,11 +7983,24 @@ io.write_char(output_stream(Stream), Character, !IO) :-
[may_call_mercury, promise_pure, tabled_for_io, thread_safe, terminates,
does_not_affect_liveness, no_sharing],
"
- if (MR_PUTCH(*Stream, Character) < 0) {
- mercury_output_error(Stream);
- }
- if (Character == '\\n') {
- MR_line_number(*Stream)++;
+ if (Character <= 0x7f) {
+ if (MR_PUTCH(*Stream, Character) < 0) {
+ mercury_output_error(Stream);
+ }
+ if (Character == '\\n') {
+ MR_line_number(*Stream)++;
+ }
+ } else {
+ char buf[5];
+ size_t len;
+ int i;
+ len = MR_utf8_encode(buf, Character);
+ for (i = 0; i < len; i++) {
+ if (MR_PUTCH(*Stream, buf[i]) < 0) {
+ mercury_output_error(Stream);
+ break;
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -8011,7 +8153,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
switch (stream.line_ending) {
case io.ML_line_ending_kind.ML_raw_binary:
case io.ML_line_ending_kind.ML_Unix_line_ending:
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
break;
case io.ML_line_ending_kind.ML_OS_line_ending:
w.WriteLine("""");
@@ -8019,7 +8161,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
}
stream.line_number++;
} else {
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
}
").
@@ -8086,7 +8228,10 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
io.write_char_2(Stream::in, Character::in, _IO0::di, _IO::uo),
[may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
"
- ((io.MR_TextOutputFile) Stream).put_or_throw(Character);
+ char[] buf = java.lang.Character.toChars(Character);
+ for (char c : buf) {
+ ((io.MR_TextOutputFile) Stream).put_or_throw(c);
+ }
").
:- pragma foreign_proc("Java",
diff --git a/library/lexer.m b/library/lexer.m
index 95fb5cc..31094b7 100644
--- a/library/lexer.m
+++ b/library/lexer.m
@@ -1223,13 +1223,14 @@ get_unicode_escape(Stream, NumHexChars, QuoteChar, Chars, HexChars, Token,
( if
rev_char_list_to_string(HexChars, HexString),
string.base_string_to_int(16, HexString, UnicodeCharCode),
- convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+ allowed_unicode_char_code(UnicodeCharCode),
+ char.from_int(UnicodeCharCode, UnicodeChar)
then
( if UnicodeCharCode = 0 then
Token = null_character_error
else
get_quoted_name(Stream, QuoteChar,
- list.reverse(UTFChars) ++ Chars, Token, !IO)
+ [UnicodeChar | Chars], Token, !IO)
)
else
Token = error("invalid Unicode character code")
@@ -1263,9 +1264,10 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
( if
rev_char_list_to_string(HexChars, HexString),
string.base_string_to_int(16, HexString, UnicodeCharCode),
- convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+ allowed_unicode_char_code(UnicodeCharCode),
+ char.from_int(UnicodeCharCode, UnicodeChar)
then
- RevCharsWithUnicode = list.reverse(UTFChars) ++ Chars,
+ RevCharsWithUnicode = [UnicodeChar | Chars],
( if UnicodeCharCode = 0 then
string_get_context(Posn0, Context, !Posn),
Token = null_character_error
@@ -1292,90 +1294,6 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
)
).
-:- pred convert_unicode_char_to_target_chars(int::in, list(char)::out)
- is semidet.
-
-convert_unicode_char_to_target_chars(UnicodeCharCode, Chars) :-
- BackendEncoding = backend_unicode_encoding,
- (
- BackendEncoding = utf8,
- encode_unicode_char_as_utf8(UnicodeCharCode, Chars)
- ;
- BackendEncoding = utf16,
- encode_unicode_char_as_utf16(UnicodeCharCode, Chars)
- ).
-
-:- pred encode_unicode_char_as_utf8(int::in, list(char)::out) is semidet.
-
-encode_unicode_char_as_utf8(UnicodeCharCode, UTF8Chars) :-
- allowed_unicode_char_code(UnicodeCharCode),
-
- % Refer to table 3-5 of the Unicode 4.0.0 standard (available from
- % www.unicode.org) for documentation on the bit distribution patterns used
- % below.
-
- ( if UnicodeCharCode =< 0x00007F then
- UTF8Chars = [char.det_from_int(UnicodeCharCode)]
- else if UnicodeCharCode =< 0x0007FF then
- Part1 = (0b11111000000 /\ UnicodeCharCode) >> 6,
- Part2 = 0b00000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11000000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- UTF8Chars = [UTF8Char1, UTF8Char2]
- else if UnicodeCharCode =< 0x00FFFF then
- Part1 = (0b1111000000000000 /\ UnicodeCharCode) >> 12,
- Part2 = (0b0000111111000000 /\ UnicodeCharCode) >> 6,
- Part3 = 0b0000000000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11100000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
- UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3]
- else
- Part1 = (0b111000000000000000000 /\ UnicodeCharCode) >> 18,
- Part2 = (0b000111111000000000000 /\ UnicodeCharCode) >> 12,
- Part3 = (0b000000000111111000000 /\ UnicodeCharCode) >> 6,
- Part4 = 0b000000000000000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11110000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
- char.det_from_int(Part4 \/ 0b10000000, UTF8Char4),
- UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3, UTF8Char4]
- ).
-
-:- pred encode_unicode_char_as_utf16(int::in, list(char)::out) is semidet.
-
- % This predicate should only be called on backends that have
- % a 16 bit character type.
- %
-encode_unicode_char_as_utf16(UnicodeCharCode, UTF16Chars) :-
- allowed_unicode_char_code(UnicodeCharCode),
-
- % If the code point is less than or equal to 0xFFFF
- % then the UTF-16 encoding is simply the code point value,
- % otherwise we construct a surrogate pair.
-
- ( if UnicodeCharCode =< 0xFFFF then
- char.det_from_int(UnicodeCharCode, Char),
- UTF16Chars = [Char]
- else
- %
- % Refer to table 3-4 of the Unicode 4.0.0 standard (available from
- % www.unicode.org) for documentation on the bit distribution patterns
- % used below.
- %
- UUUUU = (0b111110000000000000000 /\ UnicodeCharCode) >> 16,
- XXXXXX = (0b000001111110000000000 /\ UnicodeCharCode) >> 10,
- XXXXXXXXXX = (0b000000000001111111111 /\ UnicodeCharCode),
- WWWWW = UUUUU - 1,
- Surrogate1Lead = 0b1101100000000000,
- Surrogate2Lead = 0b1101110000000000,
- Surrogate1 = Surrogate1Lead \/ (WWWWW << 6) \/ XXXXXX,
- Surrogate2 = Surrogate2Lead \/ XXXXXXXXXX,
- char.det_from_int(Surrogate1, Surrogate1Char),
- char.det_from_int(Surrogate2, Surrogate2Char),
- UTF16Chars = [Surrogate1Char, Surrogate2Char]
- ).
-
:- pred allowed_unicode_char_code(int::in) is semidet.
% Succeeds if the give code point is a legal Unicode code point
@@ -1389,60 +1307,6 @@ allowed_unicode_char_code(Code) :-
Code >= 0xD800, Code =< 0xDFFF
).
-:- type unicode_encoding
- ---> utf8
- ; utf16.
-
-:- func backend_unicode_encoding = unicode_encoding.
-
-backend_unicode_encoding = Encoding :-
- Int = backend_unicode_encoding_int,
- ( unicode_encoding_int_to_encoding(Int, EncodingPrime) ->
- Encoding = EncodingPrime
- ;
- error("backend_unicode_encoding: unexpected Unicode encoding code")
- ).
-
-:- pred unicode_encoding_int_to_encoding(int::in, unicode_encoding::out)
- is semidet.
-
-unicode_encoding_int_to_encoding(0, utf8).
-unicode_encoding_int_to_encoding(1, utf16).
-
-:- func backend_unicode_encoding_int = int.
-
-:- pragma inline(backend_unicode_encoding_int/0).
-
-:- pragma foreign_proc("C",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness],
-"
- EncodingInt = 0;
-").
-
-:- pragma foreign_proc("Java",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
- EncodingInt = 1;
-").
-
-:- pragma foreign_proc("C#",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
- EncodingInt = 1;
-").
-
-:- pragma foreign_proc("Erlang",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness],
-"
- EncodingInt = 0
-").
-
:- pred get_hex_escape(io.input_stream::in, char::in, list(char)::in,
list(char)::in, token::out, io::di, io::uo) is det.
diff --git a/library/string.m b/library/string.m
index 8ab5454..ded50b9 100644
--- a/library/string.m
+++ b/library/string.m
@@ -20,18 +20,20 @@
%
% The representation of strings is implementation dependent and subject to
% change. In the current implementation, when Mercury is compiled to C, strings
-% are represented as in C, using a null character as the string terminator.
+% are UTF-8 encoded, using a null character as the string terminator.
+% A single code point requires one to four bytes (code units) to encode.
+%
% When Mercury is compiled to Java, strings are represented as Java `String's.
-% When Mercury is compiled to .NET IL code, strings are represented as .NET
-% `System.String's.
+% When Mercury is compiled to C# code, strings are represented as
+% `System.String's. In both cases, strings are UTF-16 encoded. A single code
+% point requires one or two 16-bit integers (code units) to encode.
%
% The builtin comparison operation on strings is also implementation dependent.
% In the current implementation, when Mercury is compiled to C, string
% comparison is implemented using C's strcmp() function. When Mercury
% is compiled to Java, string comparison is implemented using Java's
-% String.compareTo() method. When Mercury is compiled to .NET IL code
-% string comparison is implemented using C#'s System.String.CompareOrdinal()
-% method.
+% String.compareTo() method. When Mercury is compiled to C#, string comparison
+% is implemented using C#'s System.String.CompareOrdinal() method.
%
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
@@ -69,14 +71,45 @@
:- type text_file
---> text_file(string).
- % Determine the length of a string.
+ % Determine the length of a string, in code units.
% An empty string has length zero.
%
+ % NOTE: code points (characters) are encoded using one or more code units,
+ % i.e. bytes for UTF-8; 16-bit integers for UTF-16.
+ %
:- func string.length(string::in) = (int::uo) is det.
:- pred string.length(string, int).
:- mode string.length(in, uo) is det.
:- mode string.length(ui, uo) is det.
+ % Synonyms for string.length.
+ %
+:- func string.count_code_units(string) = int.
+:- pred string.count_code_units(string::in, int::out) is det.
+
+ % Determine the number of code points in a string.
+ %
+:- func string.count_codepoints(string) = int.
+:- pred string.count_codepoints(string::in, int::out) is det.
+
+ % string.codepoint_offset(String, CodePointCount, CodePointOffset):
+ % Equivalent to `string.codepoint_offset(String, 0, CodePointCount,
+ % CodePointOffset)'.
+ %
+:- pred string.codepoint_offset(string::in, int::in, int::out) is semidet.
+
+ % string.codepoint_offset(String, StartOffset, CodePointCount,
+ % CodePointOffset):
+ %
+ % Return the offset into `String' where, starting from `StartOffset',
+ % `CodePointCount' code points are skipped. Fails if either `StartOffset'
+ % or `CodePointOffset' are out of range.
+ %
+ % XXX Unicode not confirmed: Unpaired surrogates are treated as one code point each.
+ %
+:- pred string.codepoint_offset(string::in, int::in, int::in, int::out)
+ is semidet.
+
% Append two strings together.
%
:- func string.append(string::in, string::in) = (string::uo) is det.
@@ -170,7 +203,7 @@
:- mode string.string_ops_noncanon(in, in, in, out) is cc_multi.
% string.char_to_string(Char, String).
- % Converts a character (single-character atom) to a string or vice versa.
+ % Converts a character (code point) to a string or vice versa.
%
:- func string.char_to_string(char::in) = (string::uo) is det.
:- pred string.char_to_string(char, string).
@@ -238,7 +271,7 @@
:- func string.from_c_pointer(c_pointer::in) = (string::uo) is det.
% string.first_char(String, Char, Rest) is true iff Char is the first
- % character of String, and Rest is the remainder.
+ % character (code point) of String, and Rest is the remainder.
%
% WARNING: string.first_char makes a copy of Rest because the garbage
% collector doesn't handle references into the middle of an object,
@@ -298,16 +331,18 @@
:- func string.uncapitalize_first(string) = string.
:- pred string.uncapitalize_first(string::in, string::out) is det.
- % Convert the string to a list of characters.
- % Throws an exception if the list of characters contains a null character.
+ % Convert the string to a list of characters (code points).
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.to_char_list(string) = list(char).
:- pred string.to_char_list(string, list(char)).
:- mode string.to_char_list(in, out) is det.
:- mode string.to_char_list(uo, in) is det.
- % Convert a list of characters to a string.
- % Throws an exception if the list of characters contains a null character.
+ % Convert a list of characters (code points) to a string.
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.from_char_list(list(char)::in) = (string::uo) is det.
:- pred string.from_char_list(list(char), string).
@@ -315,19 +350,20 @@
:- mode string.from_char_list(out, in) is det.
% As above, but fail instead of throwing an exception if the
- % list contains a null character.
+ % list contains a null character or surrogate code point.
%
:- pred string.semidet_from_char_list(list(char)::in, string::uo) is semidet.
% Same as string.from_char_list, except that it reverses the order
% of the characters.
- % Throws an exception if the list of characters contains a null character.
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.from_rev_char_list(list(char)::in) = (string::uo) is det.
:- pred string.from_rev_char_list(list(char)::in, string::uo) is det.
% As above, but fail instead of throwing an exception if the
- % list contains a null character.
+ % list contains a null character or surrogate code point.
%
:- pred string.semidet_from_rev_char_list(list(char)::in, string::uo)
is semidet.
@@ -371,15 +407,17 @@
%
:- pred string.to_float(string::in, float::out) is semidet.
- % True if string contains only alphabetic characters (letters).
+ % True if string contains only alphabetic characters [A-Za-z].
%
:- pred string.is_all_alpha(string::in) is semidet.
- % True if string contains only alphabetic characters and underscores.
+ % True if string contains only alphabetic characters [A-Za-z] and
+ % underscores.
%
:- pred string.is_all_alpha_or_underscore(string::in) is semidet.
- % True if string contains only letters, digits, and underscores.
+ % True if string contains only alphabetic characters [A-Za-z], digits,
+ % and underscores.
%
:- pred string.is_all_alnum_or_underscore(string::in) is semidet.
@@ -389,7 +427,7 @@
% string.all_match(TestPred, String):
%
- % True if TestPred is true when applied to each character in
+ % True if TestPred is true when applied to each character (code point) in
% String or if String is the empty string.
%
:- pred string.all_match(pred(char)::in(pred(in) is semidet), string::in)
@@ -398,6 +436,8 @@
% string.pad_left(String0, PadChar, Width, String):
% Insert `PadChar's at the left of `String0' until it is at least as long
% as `Width', giving `String'.
+ % Width is measured as the number of code points, which is too simplistic
+ % if the string may contain zero or double width code points.
%
:- func string.pad_left(string, char, int) = string.
:- pred string.pad_left(string::in, char::in, int::in, string::out) is det.
@@ -405,33 +445,41 @@
% string.pad_right(String0, PadChar, Width, String):
% Insert `PadChar's at the right of `String0' until it is at least as long
% as `Width', giving `String'.
+ % Width is measured as the number of code points, which is too simplistic
+ % if the string may contain zero or double width code points.
%
:- func string.pad_right(string, char, int) = string.
:- pred string.pad_right(string::in, char::in, int::in, string::out) is det.
% string.duplicate_char(Char, Count, String):
% Construct a string consisting of `Count' occurrences of `Char'
- % in sequence.
+ % code points in sequence.
%
:- func string.duplicate_char(char::in, int::in) = (string::uo) is det.
:- pred string.duplicate_char(char::in, int::in, string::uo) is det.
% string.contains_char(String, Char):
- % Succeed if `Char' occurs in `String'.
+ % Succeed if the code point `Char' occurs in `String'.
%
:- pred string.contains_char(string::in, char::in) is semidet.
+% XXX add a predicate to validate that a string is not illegal
+
% string.index(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
- % Fails if `Index' is out of range (negative, or greater than or equal to
- % the length of `String').
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'. Fails if `Index' is out of range (negative, or
+ % greater than or equal to the length of `String').
+ %
+ % Calls error/1 if an illegal sequence is detected.
%
:- pred string.index(string::in, int::in, char::uo) is semidet.
% string.index_det(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'.
% Calls error/1 if `Index' is out of range (negative, or greater than
- % or equal to the length of `String').
+ % or equal to the length of `String'), or if an illegal sequence is
+ % detected.
%
:- func string.index_det(string, int) = char.
:- pred string.index_det(string::in, int::in, char::uo) is det.
@@ -442,7 +490,8 @@
:- func string ^ elem(int) = char.
% string.unsafe_index(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'.
% WARNING: behavior is UNDEFINED if `Index' is out of range
% (negative, or greater than or equal to the length of `String').
% This version is constant time, whereas string.index_det
@@ -456,56 +505,92 @@
%
:- func string ^ unsafe_elem(int) = char.
+ % string.index_next(String, Index, NextIndex, Char):
+ % Like `string.index'/3 but also returns the position of the code unit
+ % that follows the code point beginning at `Index',
+ % i.e. NextIndex = Index + num_bytes_to_encode(Char).
+ %
+:- pred string.index_next(string::in, int::in, int::out, char::uo) is semidet.
+
+ % string.unsafe_index_next(String, Index, NextIndex, Char):
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'. `NextIndex' is the offset following the encoding
+ % of `Char'. Fails if `Index' is equal to the length of `String'.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than the length of `String').
+ %
+:- pred string.unsafe_index_next(string::in, int::in, int::out, char::uo)
+ is semidet.
+
+ % string.unsafe_prev_index(String, Index, CharIndex, Char):
+ % `Char' is the character (code point) in `String' immediately _before_
+ % the code unit `Index'. `CharIndex' is the offset of the beginning of
+ % `Char'. Fails if `Index' is zero.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than or equal to the length of `String').
+ %
+:- pred string.unsafe_prev_index(string::in, int::in, int::out, char::uo)
+ is semidet.
+
+ % string.unsafe_index_code_unit(String, Index, CodeUnit):
+ % `Code' unit is the code unit in `String' at the offset `Index'.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than or equal to the length of `String').
+ %
+:- pred string.unsafe_index_code_unit(string::in, int::in, int::out) is det.
+
% string.chomp(String):
% `String' minus any single trailing newline character.
%
:- func string.chomp(string) = string.
% string.lstrip(String):
- % `String' minus any initial whitespace characters.
+ % `String' minus any initial whitespace characters in the ASCII range.
%
:- func string.lstrip(string) = string.
% string.rstrip(String):
- % `String' minus any trailing whitespace characters.
+ % `String' minus any trailing whitespace characters in the ASCII range.
%
:- func string.rstrip(string) = string.
% string.strip(String):
- % `String' minus any initial and trailing whitespace characters.
+ % `String' minus any initial and trailing whitespace characters in the
+ % ASCII range.
%
:- func string.strip(string) = string.
% string.lstrip_pred(Pred, String):
- % `String' minus the maximal prefix consisting entirely of chars
- % satisfying `Pred'.
+ % `String' minus the maximal prefix consisting entirely of characters
+ % (code points) satisfying `Pred'.
%
:- func string.lstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
= (string::out) is det.
% string.rstrip_pred(Pred, String):
- % `String' minus the maximal suffix consisting entirely of chars
- % satisfying `Pred'.
+ % `String' minus the maximal suffix consisting entirely of characters
+ % (code points) satisfying `Pred'.
%
:- func string.rstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
= (string::out) is det.
% string.prefix_length(Pred, String):
- % The length of the maximal prefix of `String' consisting entirely of
- % chars satisfying Pred.
+ % The length (in code units) of the maximal prefix of `String' consisting
+ % entirely of characters (code points) satisfying Pred.
%
:- func string.prefix_length(pred(char)::in(pred(in) is semidet), string::in)
= (int::out) is det.
% string.suffix_length(Pred, String):
- % The length of the maximal suffix of `String' consisting entirely of chars
- % satisfying Pred.
+ % The length (in code units) of the maximal suffix of `String' consisting
+ % entirely of characters (code points) satisfying Pred.
%
:- func suffix_length(pred(char)::in(pred(in) is semidet), string::in)
= (int::out) is det.
% string.set_char(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% Fails if `Index' is out of range (negative, or greater than or equal to
% the length of `String0').
%
@@ -516,7 +601,8 @@
%:- mode string.set_char(in, in, di, uo) is semidet.
% string.set_char_det(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% Calls error/1 if `Index' is out of range (negative, or greater than
% or equal to the length of `String0').
%
@@ -528,7 +614,8 @@
%:- mode string.set_char_det(in, in, di, uo) is det.
% string.unsafe_set_char(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% WARNING: behavior is UNDEFINED if `Index' is out of range
% (negative, or greater than or equal to the length of `String0').
% This version is constant time, whereas string.set_char_det
@@ -547,8 +634,8 @@
% string.foldl(Closure, String, !Acc):
% `Closure' is an accumulator predicate which is to be called for each
- % character of the string `String' in turn. The initial value of the
- % accumulator is `!.Acc' and the final value is `!:Acc'.
+ % character (code point) of the string `String' in turn. The initial
+ % value of the accumulator is `!.Acc' and the final value is `!:Acc'.
% (string.foldl is equivalent to
% string.to_char_list(String, Chars),
% list.foldl(Closure, Chars, !Acc)
@@ -583,6 +670,8 @@
% is equivalent to string.foldl(Closure, SubString, !Acc)
% where SubString = string.substring(String, Start, Count).
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- func string.foldl_substring(func(char, A) = A, string, int, int, A) = A.
:- pred string.foldl_substring(pred(char, A, A), string, int, int, A, A).
:- mode string.foldl_substring(pred(in, in, out) is det, in, in, in,
@@ -599,6 +688,8 @@
% string.foldl_substring2(Closure, String, Start, Count, !Acc1, !Acc2)
% A variant of string.foldl_substring with two accumulators.
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- pred string.foldl2_substring(pred(char, A, A, B, B),
string, int, int, A, A, B, B).
:- mode string.foldl2_substring(pred(in, di, uo, di, uo) is det,
@@ -629,6 +720,8 @@
% is equivalent to string.foldr(Closure, SubString, !Acc)
% where SubString = string.substring(String, Start, Count).
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- func string.foldr_substring(func(char, T) = T, string, int, int, T) = T.
:- pred string.foldr_substring(pred(char, T, T), string, int, int, T, T).
:- mode string.foldr_substring(pred(in, in, out) is det, in, in, in,
@@ -644,7 +737,8 @@
% string.words_separator(SepP, String) returns the list of non-empty
% substrings of String (in first to last order) that are delimited
- % by non-empty sequences of chars matched by SepP. For example,
+ % by non-empty sequences of characters (code points) matched by SepP.
+ % For example,
%
% string.words_separator(char.is_whitespace, " the cat sat on the mat") =
% ["the", "cat", "sat", "on", "the", "mat"]
@@ -661,7 +755,7 @@
% string.split_at_separator(SepP, String) returns the list of
% substrings of String (in first to last order) that are delimited
- % by chars matched by SepP. For example,
+ % by characters (code points) matched by SepP. For example,
%
% string.split_at_separator(char.is_whitespace, " a cat sat on the mat")
% = ["", "a", "cat", "", "sat", "on", "the", "", "mat"]
@@ -686,33 +780,59 @@
%
:- func string.split_at_string(string, string) = list(string).
- % string.split(String, Count, LeftSubstring, RightSubstring):
- % `LeftSubstring' is the left-most `Count' characters of `String',
- % and `RightSubstring' is the remainder of `String'.
+ % string.split(String, Index, LeftSubstring, RightSubstring):
+ % Split a string into two substrings, at the code unit `Index'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- pred string.split(string::in, int::in, string::uo, string::uo) is det.
+ % string.split_by_codepoint(String, Count, LeftSubstring, RightSubstring):
+ % `LeftSubstring' is the left-most `Count' characters (code points) of
+ % `String', and `RightSubstring' is the remainder of `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- pred string.split_by_codepoint(string::in, int::in, string::uo, string::uo)
+ is det.
+
% string.left(String, Count, LeftSubstring):
- % `LeftSubstring' is the left-most `Count' characters of `String'.
+ % `LeftSubstring' is the left-most `Count' code _units_ of `String'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- func string.left(string::in, int::in) = (string::uo) is det.
:- pred string.left(string::in, int::in, string::uo) is det.
+ % string.left_by_codepoint(String, Count, LeftSubstring):
+ % `LeftSubstring' is the left-most `Count' characters (code points) of
+ % `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- func string.left_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.left_by_codepoint(string::in, int::in, string::uo) is det.
+
% string.right(String, Count, RightSubstring):
- % `RightSubstring' is the right-most `Count' characters of `String'.
+ % `RightSubstring' is the right-most `Count' code _units_ of `String'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- func string.right(string::in, int::in) = (string::uo) is det.
:- pred string.right(string::in, int::in, string::uo) is det.
+ % string.right_by_codepoint(String, Count, RightSubstring):
+ % `RightSubstring' is the right-most `Count' characters (code points) of
+ % `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- func string.right_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.right_by_codepoint(string::in, int::in, string::uo) is det.
+
% string.substring(String, Start, Count, Substring):
- % `Substring' is first the `Count' characters in what would remain
- % of `String' after the first `Start' characters were removed.
+ % `Substring' is first the `Count' code _units_ in what would remain
+ % of `String' after the first `Start' code _units_ were removed.
% (If `Start' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.
% If `Count' is out of the range [0, length of `String' - `Start'],
@@ -721,15 +841,28 @@
:- func string.substring(string::in, int::in, int::in) = (string::uo) is det.
:- pred string.substring(string::in, int::in, int::in, string::uo) is det.
+ % string.substring_by_codepoint(String, Start, Count, Substring):
+ % `Substring' is first the `Count' code points in what would remain
+ % of `String' after the first `Start' code points were removed.
+ % (If `Start' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.
+ % If `Count' is out of the range [0, length of `String' - `Start'],
+ % it is treated as if it were the nearest end-point of that range.)
+ %
+:- func string.substring_by_codepoint(string::in, int::in, int::in)
+ = (string::uo) is det.
+:- pred string.substring_by_codepoint(string::in, int::in, int::in, string::uo)
+ is det.
+
% string.unsafe_substring(String, Start, Count, Substring):
- % `Substring' is first the `Count' characters in what would remain
- % of `String' after the first `Start' characters were removed.
+ % `Substring' is first the `Count' code _units_ in what would remain
+ % of `String' after the first `Start' code _units_ were removed.
% WARNING: if `Start' is out of the range [0, length of `String'],
% or if `Count' is out of the range [0, length of `String' - `Start'],
% then the behaviour is UNDEFINED. Use with care!
% This version takes time proportional to the length of the substring,
% whereas string.substring may take time proportional to the length
- %% of the whole string.
+ % of the whole string.
%
:- func string.unsafe_substring(string::in, int::in, int::in) = (string::uo)
is det.
@@ -757,16 +890,16 @@
:- func string.hash3(string) = int.
% string.sub_string_search(String, SubString, Index).
- % `Index' is the position in `String' where the first occurrence of
- % `SubString' begins. Indices start at zero, so if `SubString' is a prefix
- % of `String', this will return Index = 0.
+ % `Index' is the code unit position in `String' where the first
+ % occurrence of `SubString' begins. Indices start at zero, so if
+ % `SubString' is a prefix of `String', this will return Index = 0.
%
:- pred string.sub_string_search(string::in, string::in, int::out) is semidet.
% string.sub_string_search_start(String, SubString, BeginAt, Index).
- % `Index' is the position in `String' where the first occurrence of
- % `SubString' occurs such that 'Index' is greater than or equal to
- % `BeginAt'. Indices start at zero,
+ % `Index' is the code unit position in `String' where the first
+ % occurrence of `SubString' occurs such that 'Index' is greater than or
+ % equal to `BeginAt'. Indices start at zero.
%
:- pred string.sub_string_search_start(string::in, string::in, int::in,
int::out)
@@ -848,6 +981,11 @@
% bb * 22
% ccc * 333
%
+ % NOTE: the result is only aligned in termed of numbers of code points.
+ % The output may not be properly aligned if any code points occupy more or
+ % less than one column when printed.
+ % XXX we could try to use wcswidth()
+ %
:- func string.format_table(list(justified_column), string) = string.
% format_table_max(Columns, Separator) does the same job as format_table,
@@ -861,11 +999,14 @@
; right(list(string)).
% word_wrap(Str, N) = Wrapped.
- % Wrapped is Str with newlines inserted between words so that at most
- % N characters appear on a line and each line contains as many whole words
- % as possible. If any one word exceeds N characters in length then it will
- % be broken over two (or more) lines. Sequences of whitespace characters
- % are replaced by a single space.
+ % Wrapped is Str with newlines inserted between words (separated by ASCII
+ % space characters) so that at most N code points appear on a line and each
+ % line contains as many whole words as possible. If any one word exceeds N
+ % code point in length then it will be broken over two (or more) lines.
+ % Sequences of whitespace characters are replaced by a single space.
+ %
+ % NOTE: this function does not care what a code point is, nor how wide are
+ % the glyphs for each code point.
%
:- func string.word_wrap(string, int) = string.
@@ -873,7 +1014,7 @@
% word_wrap_separator/3 is like word_wrap/2, except that words that
% need to be broken up over multiple lines have WordSeparator inserted
% between each piece. If the length of WordSeparator is greater than
- % or equal to N, then no separator is used.
+ % or equal to N code points, then no separator is used.
%
:- func string.word_wrap_separator(string, int, string) = string.
@@ -1008,12 +1149,14 @@ string.foldl2(Closure, String, !Acc1, !Acc2) :-
string.foldl_substring(Closure, String, Start0, Count0, !Acc) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldl_substring_2(Closure, String, Start, Count, !Acc).
+ End = Start + Count,
+ string.foldl_substring_2(Closure, String, Start, End, !Acc).
string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldl2_substring_2(Closure, String, Start, Count, !Acc1, !Acc2).
+ End = Start + Count,
+ string.foldl2_substring_2(Closure, String, Start, End, !Acc1, !Acc2).
:- pred string.foldl_substring_2(pred(char, A, A), string, int, int, A, A).
:- mode string.foldl_substring_2(pred(in, di, uo) is det, in, in, in,
@@ -1027,10 +1170,14 @@ string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
:- mode string.foldl_substring_2(pred(in, in, out) is multi, in, in, in,
in, out) is multi.
-string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I), !Acc),
- string.foldl_substring_2(Closure, String, I + 1, Count - 1, !Acc)
+string.foldl_substring_2(Closure, String, I, End, !Acc) :-
+ (
+ I < End,
+ string.unsafe_index_next(String, I, J, Char),
+ J =< End
+ ->
+ Closure(Char, !Acc),
+ string.foldl_substring_2(Closure, String, J, End, !Acc)
;
true
).
@@ -1050,11 +1197,14 @@ string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
:- mode string.foldl2_substring_2(pred(in, in, out, in, out) is multi,
in, in, in, in, out, in, out) is multi.
-string.foldl2_substring_2(Closure, String, I, Count, !Acc1, !Acc2) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I), !Acc1, !Acc2),
- string.foldl2_substring_2(Closure, String, I + 1, Count - 1,
- !Acc1, !Acc2)
+string.foldl2_substring_2(Closure, String, I, End, !Acc1, !Acc2) :-
+ (
+ I < End,
+ string.unsafe_index_next(String, I, J, Char),
+ J =< End
+ ->
+ Closure(Char, !Acc1, !Acc2),
+ string.foldl2_substring_2(Closure, String, J, End, !Acc1, !Acc2)
;
true
).
@@ -1073,7 +1223,8 @@ string.foldr(Closure, String, Acc0, Acc) :-
string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldr_substring_2(Closure, String, Start, Count, Acc0, Acc).
+ End = Start + Count,
+ string.foldr_substring_2(Closure, String, Start, End, Acc0, Acc).
:- pred string.foldr_substring_2(pred(char, T, T), string, int, int, T, T).
:- mode string.foldr_substring_2(pred(in, in, out) is det, in, in, in,
@@ -1087,10 +1238,14 @@ string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
:- mode string.foldr_substring_2(pred(in, in, out) is multi, in, in, in,
in, out) is multi.
-string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I + Count - 1), !Acc),
- string.foldr_substring_2(Closure, String, I, Count - 1, !Acc)
+string.foldr_substring_2(Closure, String, Start, I, !Acc) :-
+ (
+ I > Start,
+ string.unsafe_prev_index(String, I, J, Char),
+ J >= Start
+ ->
+ Closure(Char, !Acc),
+ string.foldr_substring_2(Closure, String, Start, J, !Acc)
;
true
).
@@ -1098,11 +1253,25 @@ string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
string.left(String, Count, LeftString) :-
string.split(String, Count, LeftString, _RightString).
+string.left_by_codepoint(String, Count) = LeftString :-
+ string.left_by_codepoint(String, Count, LeftString).
+
+string.left_by_codepoint(String, Count, LeftString) :-
+ string.split_by_codepoint(String, Count, LeftString, _RightString).
+
string.right(String, RightCount, RightString) :-
string.length(String, Length),
LeftCount = Length - RightCount,
string.split(String, LeftCount, _LeftString, RightString).
+string.right_by_codepoint(String, RightCount) = RightString :-
+ string.right_by_codepoint(String, RightCount, RightString).
+
+string.right_by_codepoint(String, RightCount, RightString) :-
+ string.count_codepoints(String, TotalCount),
+ LeftCount = TotalCount - RightCount,
+ string.split_by_codepoint(String, LeftCount, _LeftString, RightString).
+
string.remove_suffix(String, Suffix, StringWithoutSuffix) :-
string.suffix(String, Suffix),
string.left(String, length(String) - length(Suffix), StringWithoutSuffix).
@@ -1364,11 +1533,16 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- MR_ConstString p = Str + strlen(Str);
+ int pos = strlen(Str);
+ int c;
+
CharList = MR_list_empty_msg(MR_PROC_LABEL);
- while (p > Str) {
- p--;
- CharList = MR_char_list_cons_msg((MR_UnsignedChar) *p, CharList,
+ for (;;) {
+ c = MR_utf8_prev_get(Str, &pos);
+ if (c <= 0) {
+ break;
+ }
+ CharList = MR_char_list_cons_msg((MR_UnsignedChar) c, CharList,
MR_PROC_LABEL);
}
}").
@@ -1380,7 +1554,23 @@ string.to_char_list(Str::uo, CharList::in) :-
"
list.List_1 lst = list.empty_list();
for (int i = Str.Length - 1; i >= 0; i--) {
- char c = Str[i];
+ int c;
+ char c2 = Str[i];
+ if (System.Char.IsLowSurrogate(c2)) {
+ try {
+ char c1 = Str[i - 1];
+ c = System.Char.ConvertToUtf32(c1, c2);
+ i--;
+ } catch (System.ArgumentOutOfRangeException) {
+ c = 0xfffd;
+ } catch (System.IndexOutOfRangeException) {
+ c = 0xfffd;
+ }
+ } else if (System.Char.IsHighSurrogate(c2)) {
+ c = 0xfffd;
+ } else {
+ c = c2;
+ }
lst = list.cons(c, lst);
}
CharList = lst;
@@ -1391,10 +1581,11 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- list.List_1<Character> lst = list.empty_list();
- for (int i = Str.length() - 1; i >= 0; i--) {
- char c = Str.charAt(i);
+ list.List_1<Integer> lst = list.empty_list();
+ for (int i = Str.length(); i > 0; ) {
+ int c = Str.codePointBefore(i);
lst = list.cons(c, lst);
+ i -= java.lang.Character.charCount(c);
}
CharList = lst;
").
@@ -1451,7 +1642,8 @@ string.from_char_list(Chars::in, Str::uo) :-
size = 0;
char_list_ptr = CharList;
while (! MR_list_is_empty(char_list_ptr)) {
- size++;
+ /* XXX can return 0 for invalid code points */
+ size += MR_utf8_width(MR_list_head(char_list_ptr));
char_list_ptr = MR_list_tail(char_list_ptr);
}
@@ -1467,8 +1659,8 @@ string.from_char_list(Chars::in, Str::uo) :-
size = 0;
char_list_ptr = CharList;
while (! MR_list_is_empty(char_list_ptr)) {
- MR_Char c;
- c = (MR_Char) MR_list_head(char_list_ptr);
+ int c;
+ c = MR_list_head(char_list_ptr);
/*
** It is an error to put a null character in a string
** (see the comments at the top of this file).
@@ -1477,7 +1669,7 @@ string.from_char_list(Chars::in, Str::uo) :-
SUCCESS_INDICATOR = MR_FALSE;
break;
}
- Str[size++] = c;
+ size += MR_utf8_encode(Str + size, c);
char_list_ptr = MR_list_tail(char_list_ptr);
}
@@ -1491,7 +1683,12 @@ string.from_char_list(Chars::in, Str::uo) :-
"
System.Text.StringBuilder sb = new System.Text.StringBuilder();
while (!list.is_empty(CharList)) {
- sb.Append((char) list.det_head(CharList));
+ int cp = (int) list.det_head(CharList);
+ if (cp <= 0xffff) {
+ sb.Append((char) cp);
+ } else {
+ sb.Append(System.Char.ConvertFromUtf32(cp));
+ }
CharList = list.det_tail(CharList);
}
Str = sb.ToString();
@@ -1504,9 +1701,14 @@ string.from_char_list(Chars::in, Str::uo) :-
does_not_affect_liveness],
"
java.lang.StringBuilder sb = new StringBuilder();
- Iterable<Character> iterable = new list.ListIterator<Character>(CharList);
- for (char c : iterable) {
- sb.append(c);
+ Iterable<Integer> iterable = new list.ListIterator<Integer>(CharList);
+ for (int c : iterable) {
+ if (c <= 0xffff) {
+ /* Fast path. */
+ sb.append((char) c);
+ } else {
+ sb.append(java.lang.Character.toChars(c));
+ }
}
Str = sb.toString();
SUCCESS_INDICATOR = true;
@@ -1561,7 +1763,7 @@ string.from_rev_char_list(Chars, Str) :-
size = 0;
list_ptr = Chars;
while (!MR_list_is_empty(list_ptr)) {
- size++;
+ size += MR_utf8_width(MR_list_head(list_ptr));
list_ptr = MR_list_tail(list_ptr);
}
@@ -1589,7 +1791,8 @@ string.from_rev_char_list(Chars, Str) :-
SUCCESS_INDICATOR = MR_FALSE;
break;
}
- Str[--size] = c;
+ size -= MR_utf8_width(c);
+ MR_utf8_encode(Str + size, c);
list_ptr = MR_list_tail(list_ptr);
}
}").
@@ -1602,14 +1805,26 @@ string.from_rev_char_list(Chars, Str) :-
int size = 0;
list.List_1 list_ptr = Chars;
while (!list.is_empty(list_ptr)) {
- size++;
+ int c = (int) list.det_head(list_ptr);
+ if (c <= 0xffff) {
+ size++;
+ } else {
+ size += 2;
+ }
list_ptr = list.det_tail(list_ptr);
}
char[] arr = new char[size];
list_ptr = Chars;
while (!list.is_empty(list_ptr)) {
- arr[--size] = (char) list.det_head(list_ptr);
+ int c = (int) list.det_head(list_ptr);
+ if (c <= 0xffff) {
+ arr[--size] = (char) c;
+ } else {
+ string s = System.Char.ConvertFromUtf32(c);
+ arr[--size] = s[1];
+ arr[--size] = s[0];
+ }
list_ptr = list.det_tail(list_ptr);
}
@@ -1663,15 +1878,15 @@ string.uncapitalize_first(S0, S) :-
).
string.all_match(P, String) :-
- all_match_2(string.length(String) - 1, P, String).
+ all_match_2(P, String, 0).
-:- pred all_match_2(int::in, pred(char)::in(pred(in) is semidet), string::in)
+:- pred all_match_2(pred(char)::in(pred(in) is semidet), string::in, int::in)
is semidet.
-string.all_match_2(I, P, String) :-
- ( I >= 0 ->
- P(string.unsafe_index(String, I)),
- string.all_match_2(I - 1, P, String)
+string.all_match_2(P, String, I) :-
+ ( string.unsafe_index_next(String, I, J, Char) ->
+ P(Char),
+ string.all_match_2(P, String, J)
;
true
).
@@ -1817,7 +2032,7 @@ string.is_all_digits(S) :-
").
string.pad_left(String0, PadChar, Width, String) :-
- string.length(String0, Length),
+ string.count_codepoints(String0, Length),
( Length < Width ->
Count = Width - Length,
string.duplicate_char(PadChar, Count, PadString),
@@ -1827,7 +2042,7 @@ string.pad_left(String0, PadChar, Width, String) :-
).
string.pad_right(String0, PadChar, Width, String) :-
- string.length(String0, Length),
+ string.count_codepoints(String0, Length),
( Length < Width ->
Count = Width - Length,
string.duplicate_char(PadChar, Count, PadString),
@@ -2039,7 +2254,7 @@ string.hash(String) = HashVal :-
string.hash_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal `xor` (!.HashVal `unchecked_left_shift` 5),
!:HashVal= !.HashVal `xor` C,
string.hash_loop(String, Index + 1, Length, !HashVal)
@@ -2057,7 +2272,7 @@ string.hash2(String) = HashVal :-
string.hash2_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal * 37,
!:HashVal= !.HashVal + C,
string.hash2_loop(String, Index + 1, Length, !HashVal)
@@ -2075,7 +2290,7 @@ string.hash3(String) = HashVal :-
string.hash3_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal * 49,
!:HashVal= !.HashVal + C,
string.hash3_loop(String, Index + 1, Length, !HashVal)
@@ -2095,12 +2310,16 @@ string.sub_string_search(WholeString, Pattern, Index) :-
does_not_affect_liveness, no_sharing],
"{
char *match;
- match = strstr(WholeString + BeginAt, Pattern);
- if (match) {
- Index = match - WholeString;
- SUCCESS_INDICATOR = MR_TRUE;
- } else {
+ if ((MR_Unsigned) BeginAt > strlen(WholeString)) {
SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ match = strstr(WholeString + BeginAt, Pattern);
+ if (match) {
+ Index = match - WholeString;
+ SUCCESS_INDICATOR = MR_TRUE;
+ } else {
+ SUCCESS_INDICATOR = MR_FALSE;
+ }
}
}").
@@ -2311,8 +2530,7 @@ width(Width, !PolyTypes, !Chars) :-
non_zero_digit(!Chars),
zero_or_more_occurences(digit, !Chars),
Final = !.Chars,
-
- char_list_remove_suffix(Init, Final, Width)
+ list.remove_suffix(Init, Final, Width)
).
% Do we have a precision?
@@ -2337,7 +2555,7 @@ prec(Prec, !PolyTypes, !Chars) :-
zero_or_more_occurences(digit, !Chars),
Final = !.Chars
->
- char_list_remove_suffix(Init, Final, Prec)
+ list.remove_suffix(Init, Final, Prec)
;
% When no number follows the '.' the precision defaults to 0.
Prec = ['0']
@@ -3896,36 +4114,49 @@ count_extra_trailing_zeroes(FloatStr, I, N0) = N :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+ char buf[5];
+ size_t len;
+ if (Ch <= 0x7f) {
+ /* Fast path. */
+ SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+ } else {
+ len = MR_utf8_encode(buf, Ch);
+ buf[len] = '\\0';
+ SUCCESS_INDICATOR = (strstr(Str, buf) != NULL);
+ }
").
:- pragma foreign_proc("C#",
string.contains_char(Str::in, Ch::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.IndexOf(Ch) != -1);
+ if (Ch <= 0xffff) {
+ SUCCESS_INDICATOR = (Str.IndexOf((char) Ch) != -1);
+ } else {
+ string s = System.Char.ConvertFromUtf32(Ch);
+ SUCCESS_INDICATOR = Str.Contains(s);
+ }
").
:- pragma foreign_proc("Java",
string.contains_char(Str::in, Ch::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.indexOf(Ch) != -1);
+ // indexOf(int) handles supplementary characters correctly.
+ SUCCESS_INDICATOR = (Str.indexOf((int) Ch) != -1);
").
string.contains_char(String, Char) :-
- string.contains_char(String, Char, 0, string.length(String)).
+ string.contains_char(String, Char, 0).
-:- pred string.contains_char(string::in, char::in, int::in, int::in)
- is semidet.
+:- pred string.contains_char(string::in, char::in, int::in) is semidet.
-string.contains_char(Str, Char, Index, Length) :-
- ( Index < Length ->
- string.unsafe_index(Str, Index, IndexChar),
+string.contains_char(Str, Char, I) :-
+ ( string.unsafe_index_next(Str, I, J, IndexChar) ->
( IndexChar = Char ->
true
;
- string.contains_char(Str, Char, Index + 1, Length)
+ string.contains_char(Str, Char, J)
)
;
fail
@@ -3946,6 +4177,16 @@ string.index(Str, Index, Char) :-
fail
).
+:- pragma inline(string.index_next/4).
+
+string.index_next(Str, Index, NextIndex, Char) :-
+ Len = string.length(Str),
+ ( string.index_check(Index, Len) ->
+ string.unsafe_index_next(Str, Index, NextIndex, Char)
+ ;
+ fail
+ ).
+
:- pred string.index_check(int::in, int::in) is semidet.
% We should consider making this routine a compiler built-in.
@@ -3975,44 +4216,214 @@ string.index_check(Index, Length) :-
/*-----------------------------------------------------------------------*/
+string.unsafe_index(Str, Index, Char) :-
+ ( string.unsafe_index_2(Str, Index, Char0) ->
+ Char = Char0
+ ;
+ error("string.unsafe_index: illegal sequence")
+ ).
+
+:- pred string.unsafe_index_2(string::in, int::in, char::uo) is semidet.
+
:- pragma foreign_proc("C",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- Ch = Str[Index];
+ Ch = MR_utf8_get(Str, Index);
+ SUCCESS_INDICATOR = (Ch > 0);
").
:- pragma foreign_proc("C#",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Ch = Str[Index];
+ char c1 = Str[Index];
+ if (System.Char.IsSurrogate(c1)) {
+ char c2 = Str[Index + 1];
+ try {
+ Ch = System.Char.ConvertToUtf32(c1, c2);
+ } catch (System.ArgumentOutOfRangeException) {
+ Ch = -1;
+ }
+ } else {
+ /* Common case. */
+ Ch = c1;
+ }
+ SUCCESS_INDICATOR = (Ch > 0);
").
:- pragma foreign_proc("Java",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Ch = Str.charAt(Index);
+ Ch = Str.codePointAt(Index);
+ /* XXX Unicode: more error checking */
+ SUCCESS_INDICATOR =
+ !java.lang.Character.IsHighSurrogate(Ch) &&
+ !java.lang.Character.IsLowSurrogate(Ch);
").
:- pragma foreign_proc("Erlang",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- <<_:Index/binary, Ch/integer, _/binary>> = Str
+ /* XXX Unicode */
+ <<_:Index/binary, Ch/integer, _/binary>> = Str,
+ SUCCESS_INDICATOR = true
").
-string.unsafe_index(Str, Index, Char) :-
- ( string.first_char(Str, First, Rest) ->
- ( Index = 0 ->
- Char = First
- ;
- string.unsafe_index(Rest, Index - 1, Char)
- )
- ;
- error("string.unsafe_index: out of bounds")
- ).
+% XXX this portable version is no longer correct as it deals in code points
+% string.unsafe_index(Str, Index, Char) :-
+% ( string.first_char(Str, First, Rest) ->
+% ( Index = 0 ->
+% Char = First
+% ;
+% string.unsafe_index(Rest, Index - 1, Char)
+% )
+% ;
+% error("string.unsafe_index: out of bounds")
+% ).
String ^ unsafe_elem(Index) = unsafe_index(String, Index).
+:- pragma foreign_proc("C",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ int pos = Index;
+ Ch = MR_utf8_get_next(Str, &pos);
+ NextIndex = pos;
+ SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index < Str.Length) {
+ Ch = System.Char.ConvertToUtf32(Str, Index);
+ if (Ch <= 0xffff) {
+ NextIndex = Index + 1;
+ } else {
+ NextIndex = Index + 2;
+ }
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index < Str.length()) {
+ Ch = Str.codePointAt(Index);
+ SUCCESS_INDICATOR =
+ !java.lang.Character.IsHighSurrogate(Ch) &&
+ !java.lang.Character.IsLowSurrogate(Ch);
+ if (SUCCESS_INDICATOR) {
+ NextIndex = Index + java.lang.Character.charCount(Ch);
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("C",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ int pos = Index;
+ Ch = MR_utf8_prev_get(Str, &pos);
+ PrevIndex = pos;
+ SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index > 0) {
+ char c2 = Str[Index - 1];
+ if (System.Char.IsSurrogate(c2)) {
+ char c1 = Str[Index - 2];
+ try {
+ Ch = System.Char.ConvertToUtf32(c1, c2);
+ PrevIndex = Index - 2;
+ } catch (System.ArgumentOutOfRangeException) {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+ } else {
+ /* Common case. */
+ Ch = (int) c2;
+ PrevIndex = Index - 1;
+ }
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index > 0) {
+ Ch = Str.codePointBefore(Index);
+ // XXX Unicode: error checking
+ PrevIndex = Index - java.lang.Character.charCount(Ch);
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+/*-----------------------------------------------------------------------*/
+
+:- pragma foreign_proc("C",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Code = (MR_UnsignedChar) Str[Index];
+").
+:- pragma foreign_proc("C#",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Code = Str[Index];
+").
+:- pragma foreign_proc("Java",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Code = Str[Index];
+").
+:- pragma foreign_proc("Erlang",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::uo),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ <<_:Index/binary, Code/integer, _/binary>> = Str
+").
+
/*-----------------------------------------------------------------------*/
:- pragma foreign_decl("C",
@@ -4064,11 +4475,29 @@ string.set_char(Char, Index, !Str) :-
size_t len = strlen(Str0);
if ((MR_Unsigned) Index >= len) {
SUCCESS_INDICATOR = MR_FALSE;
- } else {
+ } else if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+ /* Fast path. */
SUCCESS_INDICATOR = MR_TRUE;
MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
strcpy(Str, Str0);
MR_set_char(Str, Index, Ch);
+ } else {
+ int oldc = MR_utf8_get(Str0, Index);
+ if (oldc < 0) {
+ SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ size_t oldwidth = MR_utf8_width(oldc);
+ size_t newwidth = MR_utf8_width(Ch);
+ size_t newlen;
+ size_t tailofs;
+
+ newlen = len - oldwidth + newwidth;
+ MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+ MR_memcpy(Str, Str0, Index);
+ MR_utf8_encode(Str + Index, Ch);
+ strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+ SUCCESS_INDICATOR = MR_TRUE;
+ }
}
").
:- pragma foreign_proc("C#",
@@ -4080,7 +4509,18 @@ string.set_char(Char, Index, !Str) :-
SUCCESS_INDICATOR = false;
} else {
System.Text.StringBuilder sb = new System.Text.StringBuilder(Str0);
- sb[Index] = Ch;
+ if (!System.Char.IsHighSurrogate(Str0, Index) && Ch <= 0xffff) {
+ /* Fast path. */
+ sb[Index] = (char) Ch;
+ } else {
+ // XXX error checking
+ if (System.Char.IsLowSurrogate(Str0, Index + 1)) {
+ sb.Remove(Index, 2);
+ } else {
+ sb.Remove(Index, 1);
+ }
+ sb.Insert(Index, System.Char.ConvertFromUtf32(Ch));
+ }
Str = sb.ToString();
SUCCESS_INDICATOR = true;
}
@@ -4094,7 +4534,17 @@ string.set_char(Char, Index, !Str) :-
SUCCESS_INDICATOR = false;
} else {
java.lang.StringBuilder sb = new StringBuilder(Str0);
- sb.setCharAt(Index, Ch);
+
+ int oldc = sb.codePointAt(Index);
+ int oldwidth = java.lang.Character.charCount(oldc);
+ int newwidth = java.lang.Character.charCount(Ch);
+ if (oldwidth == 1 && newwidth == 1) {
+ sb.setCharAt(Index, (char) Ch);
+ } else {
+ char[] buf = java.lang.Character.toChars(Ch);
+ sb.replace(Index, Index + oldwidth, new String(buf));
+ }
+
Str = sb.toString();
SUCCESS_INDICATOR = true;
}
@@ -4124,6 +4574,7 @@ string.set_char_2(Ch, Index, Str0, Str) :-
% if (Index >= Str0.Length) {
% SUCCESS_INDICATOR = false;
% } else {
+% // UTF-16
% Str = System.String.Concat(Str0.Substring(0, Index),
% System.Convert.ToString(Ch),
% Str0.Substring(Index + 1));
@@ -4152,23 +4603,55 @@ string.unsafe_set_char(Char, Index, !Str) :-
does_not_affect_liveness],
"
size_t len = strlen(Str0);
- MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
- strcpy(Str, Str0);
- MR_set_char(Str, Index, Ch);
+ if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+ /* Fast path. */
+ SUCCESS_INDICATOR = MR_TRUE;
+ MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
+ strcpy(Str, Str0);
+ MR_set_char(Str, Index, Ch);
+ } else {
+ int oldc = MR_utf8_get(Str0, Index);
+ if (oldc < 0) {
+ SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ size_t oldwidth = MR_utf8_width(oldc);
+ size_t newwidth = MR_utf8_width(Ch);
+ size_t newlen;
+ size_t tailofs;
+
+ newlen = len - oldwidth + newwidth;
+ MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+ MR_memcpy(Str, Str0, Index);
+ MR_utf8_encode(Str + Index, Ch);
+ strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+ SUCCESS_INDICATOR = MR_TRUE;
+ }
+ }
").
:- pragma foreign_proc("C#",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Str = System.String.Concat(Str0.Substring(0, Index),
- System.Convert.ToString(Ch),
- Str0.Substring(Index + 1));
+ // XXX need more error checking
+ if (System.Char.IsHighSurrogate(Str0, Index)) {
+ Str = System.String.Concat(Str0.Substring(0, Index),
+ System.Char.ConvertFromUtf32(Ch),
+ Str0.Substring(Index + 2));
+ } else {
+ Str = System.String.Concat(Str0.Substring(0, Index),
+ System.Char.ConvertFromUtf32(Ch),
+ Str0.Substring(Index + 1));
+ }
").
:- pragma foreign_proc("Java",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Str = Str0.substring(0, Index) + Ch + Str0.substring(Index + 1);
+ int oldc = Str0.codePointAt(Index);
+ int oldwidth = java.lang.Character.charCount(oldc);
+ Str = Str0.subSequence(0, Index)
+ + new String(Character.toChars(Ch))
+ + Str0.subSequence(Index + oldwidth, Str0.length());
").
:- pragma foreign_proc("Erlang",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
@@ -4182,6 +4665,7 @@ string.unsafe_set_char(Char, Index, !Str) :-
% string.unsafe_set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
% [will_not_call_mercury, promise_pure, thread_safe, does_not_affect_liveness],
% "
+% /* XXX UTF-8 */
% Str = Str0;
% MR_set_char(Str, Index, Ch);
% ").
@@ -4189,6 +4673,7 @@ string.unsafe_set_char(Char, Index, !Str) :-
% string.unsafe_set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
% [will_not_call_mercury, promise_pure, thread_safe],
% "
+% /* XXX UTF-16 */
% Str = System.String.Concat(Str0.Substring(0, Index),
% System.Convert.ToString(Ch),
% Str0.Substring(Index + 1));
@@ -4197,6 +4682,7 @@ string.unsafe_set_char(Char, Index, !Str) :-
% string.unsafe_set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
% [will_not_call_mercury, promise_pure, thread_safe],
% "
+% /* XXX UTF-16 */
% Str = Str0.substring(0, Index) + Ch + Str0.substring(Index + 1);
% ").
@@ -4270,6 +4756,331 @@ string.length_2(Str, Index, Length) :-
Length = Index
).
+string.count_code_units(Str) = string.length(Str).
+
+string.count_code_units(Str, Length) :-
+ string.length(Str, Length).
+
+/*-----------------------------------------------------------------------*/
+
+:- pragma foreign_decl("C", local, "
+#define MR_utf8_is_single_byte(c) (((unsigned)(c) & 0x80) == 0)
+#define MR_utf8_is_lead_byte(c) (((unsigned)(c) - 0xC0) < 0x3E)
+#define MR_utf8_is_trail_byte(c) (((unsigned)(c) & 0xC0) == 0x80)
+
+/*
+** Advance `*pos' to the beginning of the next code point in `s'.
+** If `*pos' is already at the end of the string then return false
+** without modifying `*pos'.
+*/
+static MR_bool MR_utf8_next(const MR_String s_, int *pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+
+ if (*pos == '\\0') {
+ return MR_FALSE;
+ }
+
+ for (;;) {
+ ++(*pos);
+ if (*pos == '\\0') {
+ break;
+ }
+ c = s[*pos];
+ if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+ break;
+ }
+ }
+
+ return MR_TRUE;
+}
+
+/*
+** Rewind `*pos' to the beginning of the previous code point in `s'.
+** If `*pos' is already at the beginning of the string then return false
+** without modifying `*pos'.
+*/
+static MR_bool MR_utf8_prev(const MR_String s_, int *pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+
+ if (*pos <= 0) {
+ return MR_FALSE;
+ }
+
+ while (*pos > 0) {
+ (*pos)--;
+ c = s[*pos];
+ if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+ break;
+ }
+ }
+
+ return MR_TRUE;
+}
+
+/*
+** Decode and return the code point beginning at `pos' in `s'.
+** Return 0 if at the end of the string (i.e. the NUL terminator).
+** If an illegal code sequence exists at that offset, return -2.
+*/
+int MR_utf8_get(const MR_String s_, int pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+ int remain;
+ int minc;
+ int i;
+
+ c = s[pos];
+
+ if (c <= 0x7F) {
+ /* Plain ASCII (including NUL terminator). */
+ return c;
+ }
+
+ if (c <= 0xC1) {
+ /* Trailing byte of multi-byte sequence or an overlong encoding for
+ * code point <= 127.
+ */
+ return -2;
+ }
+
+ if (c <= 0xDF) {
+ /* 2-byte sequence. */
+ c &= 0x1F;
+ remain = 1;
+ minc = 0x80;
+ }
+ else if (c <= 0xEF) {
+ /* 3-byte sequence. */
+ c &= 0x0F;
+ remain = 2;
+ minc = 0x800;
+ }
+ else if (c <= 0xF4) {
+ /* 4-byte sequence. */
+ c &= 0x07;
+ remain = 3;
+ minc = 0x10000;
+ }
+ else {
+ /* Otherwise invalid. */
+ return -2;
+ }
+
+ for (i = 1; i <= remain; i++) {
+ if (s[pos + i] == '\\0') {
+ return -2;
+ }
+ }
+
+ while (remain--) {
+ int d = s[++pos];
+
+ if (!MR_utf8_is_trail_byte(d)) {
+ return -2;
+ }
+
+ c = (c << 6) | (d & 0x3F);
+ }
+
+ /* Check for overlong forms, which could be used to bypass security
+ * validations. We could also check code points aren't above U+10FFFF or in
+ * the surrogate ranges, but we don't.
+ */
+
+ if (c < minc) {
+ return -2;
+ }
+
+ return c;
+}
+
+/*
+** Return the number of bytes required to encode the code point `c'.
+** Return zero if the code point is invalid.
+*/
+static size_t MR_utf8_width(int c)
+{
+ /* So we don't need to check for negative values nor use unsigned ints
+ * in the interface, which are a pain.
+ */
+ unsigned int uc = c;
+
+ if (uc <= 0x7f)
+ return 1;
+ if (uc <= 0x7ff)
+ return 2;
+ if (uc <= 0xffff)
+ return 3;
+ if (uc <= 0x10ffff)
+ return 4;
+ /* The rest are illegal. */
+ return 0;
+}
+
+/*
+**
+*/
+static int MR_utf8_get_next(const MR_String s, int *pos)
+{
+ int c = MR_utf8_get(s, *pos);
+
+ if (c >= 0) {
+ (*pos) += MR_utf8_width(c);
+ return c;
+ }
+
+ if (c == -1) {
+ /* Past end. */
+ return c;
+ }
+
+ /* Some invalid byte sequence. */
+ MR_utf8_next(s, pos);
+ return c;
+}
+
+static int MR_utf8_prev_get(const MR_String s, int *pos)
+{
+ if (MR_utf8_prev(s, pos)) {
+ return MR_utf8_get(s, *pos);
+ }
+
+ /* Past beginning. */
+ return -1;
+}
+
+size_t MR_utf8_encode(char s[], int c)
+{
+ unsigned int uc = c;
+
+ if (uc <= 0x7f) {
+ s[0] = uc;
+ return 1;
+ }
+
+ if (uc <= 0x7ff) {
+ s[0] = 0xC0 | ((uc >> 6) & 0x1F);
+ s[1] = 0x80 | (uc & 0x3F);
+ return 2;
+ }
+
+ if (uc <= 0xffff) {
+ s[0] = 0xE0 | ((uc >> 12) & 0x0F);
+ s[1] = 0x80 | ((uc >> 6) & 0x3F);
+ s[2] = 0x80 | (uc & 0x3F);
+ return 3;
+ }
+
+ if (uc <= 0x10ffff) {
+ s[0] = 0xF0 | ((uc >> 18) & 0x07);
+ s[1] = 0x80 | ((uc >> 12) & 0x3F);
+ s[2] = 0x80 | ((uc >> 6) & 0x3F);
+ s[3] = 0x80 | (uc & 0x3F);
+ return 4;
+ }
+
+ /* Otherwise is illegal. */
+ return 0;
+}
+").
+
+string.count_codepoints(String) = Count :-
+ string.count_codepoints(String, Count).
+
+:- pragma foreign_proc("C",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ unsigned char b;
+ int i;
+
+ Count = 0;
+ for (i = 0; ; i++) {
+ b = String[i];
+ if (b == '\\0') {
+ break;
+ }
+ if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+ Count++;
+ }
+ }
+").
+
+:- pragma foreign_proc("C#",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ Count = 0;
+ foreach (char c in String) {
+ if (!System.Char.IsLowSurrogate(c)) {
+ Count++;
+ }
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ Count = String.codePointCount(0, String.length());
+").
+
+string.codepoint_offset(String, N, Index) :-
+ string.codepoint_offset(String, 0, N, Index).
+
+:- pragma foreign_proc("C",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ size_t len;
+ unsigned char b;
+
+ SUCCESS_INDICATOR = MR_FALSE;
+ len = strlen(String);
+ for (Index = StartOffset; Index < len; Index++) {
+ b = String[Index];
+ if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+ if (N-- == 0) {
+ SUCCESS_INDICATOR = MR_TRUE;
+ break;
+ }
+ }
+ }
+").
+
+:- pragma foreign_proc("C#",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ SUCCESS_INDICATOR = false;
+ for (Index = StartOffset; Index < String.Length; Index++) {
+ if (!System.Char.IsLowSurrogate(String, Index)) {
+ if (N-- == 0) {
+ SUCCESS_INDICATOR = true;
+ break;
+ }
+ }
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ try {
+ Index = String.offsetByCodePoints(StartOffset, N);
+ SUCCESS_INDICATOR = true;
+ } catch (IndexOutOfBoundsException e) {
+ Index = -1;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
/*-----------------------------------------------------------------------*/
:- pragma promise_equivalent_clauses(string.append/3).
@@ -4513,12 +5324,15 @@ substring(Str, !.Start, !.Count, SubStr) :-
:- func strchars(int, int, string) = list(char).
strchars(I, End, Str) = Chars :-
- ( I >= End ->
- Chars = []
- ;
- C = string.unsafe_index(Str, I),
- Cs = strchars(I + 1, End, Str),
+ (
+ I < End,
+ string.unsafe_index_next(Str, I, J, C),
+ J =< End
+ ->
+ Cs = strchars(J, End, Str),
Chars = [C | Cs]
+ ;
+ Chars = []
).
:- pragma foreign_proc("C",
@@ -4611,6 +5425,23 @@ strchars(I, End, Str) = Chars :-
end
").
+string.substring_by_codepoint(Str, Start, Count) = SubString :-
+ string.substring_by_codepoint(Str, Start, Count, SubString).
+
+string.substring_by_codepoint(Str, Start, Count, SubString) :-
+ ( string.codepoint_offset(Str, Start, StartOffset0) ->
+ StartOffset = StartOffset0
+ ;
+ StartOffset = 0
+ ),
+ ( string.codepoint_offset(Str, StartOffset, Count, EndOffset0) ->
+ EndOffset = EndOffset0
+ ;
+ EndOffset = string.length(Str)
+ ),
+ OffsetCount = EndOffset - StartOffset,
+ string.substring(Str, StartOffset, OffsetCount, SubString).
+
:- pragma foreign_proc("C",
string.unsafe_substring(Str::in, Start::in, Count::in, SubString::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -4722,24 +5553,36 @@ strchars(I, End, Str) = Chars :-
end
").
-string.split(Str, Count, Left, Right) :-
- ( Count =< 0 ->
+% XXX this portable version is no longer correct as it deals in code points
+% string.split(Str, Count, Left, Right) :-
+% ( Count =< 0 ->
+% Left = "",
+% copy(Str, Right)
+% ;
+% string.to_char_list(Str, List),
+% Len = string.length(Str),
+% ( Count > Len ->
+% Num = Len
+% ;
+% Num = Count
+% ),
+% ( list.split_list(Num, List, LeftList, RightList) ->
+% string.to_char_list(Left, LeftList),
+% string.to_char_list(Right, RightList)
+% ;
+% error("string.split")
+% )
+% ).
+
+string.split_by_codepoint(Str, Count, Left, Right) :-
+ ( string.codepoint_offset(Str, Count, Offset) ->
+ string.split(Str, Offset, Left, Right)
+ ; Count =< 0 ->
Left = "",
copy(Str, Right)
;
- string.to_char_list(Str, List),
- Len = string.length(Str),
- ( Count > Len ->
- Num = Len
- ;
- Num = Count
- ),
- ( list.split_list(Num, List, LeftList, RightList) ->
- string.to_char_list(Left, LeftList),
- string.to_char_list(Right, RightList)
- ;
- error("string.split")
- )
+ copy(Str, Left),
+ Right = ""
).
/*-----------------------------------------------------------------------*/
@@ -4749,10 +5592,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ int pos = 0;
+ int c = MR_utf8_get_next(Str, &pos);
SUCCESS_INDICATOR = (
- Str[0] == First &&
+ c == First &&
First != '\\0' &&
- strcmp(Str + 1, Rest) == 0
+ strcmp(Str + pos, Rest) == 0
);
").
:- pragma foreign_proc("C#",
@@ -4760,19 +5605,32 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe],
"
int len = Str.Length;
- SUCCESS_INDICATOR = (
- len > 0 &&
- Str[0] == First &&
- System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
- );
+ if (First <= 0xffff) {
+ SUCCESS_INDICATOR = (
+ len > 0 &&
+ Str[0] == First &&
+ System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
+ );
+ } else {
+ string firstchars = System.Char.ConvertFromUtf32(First);
+ SUCCESS_INDICATOR = (
+ len > 1 &&
+ Str[0] == firstchars[0] &&
+ Str[1] == firstchars[1] &&
+ System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
+ );
+ }
").
:- pragma foreign_proc("Java",
string.first_char(Str::in, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.length() == Rest.length() + 1 &&
- Str.charAt(0) == First &&
- Str.endsWith(Rest));
+ int toffset = java.lang.Character.charCount(First);
+ SUCCESS_INDICATOR = (
+ Str.length() > 0 &&
+ Str.codePointAt(0) == First &&
+ Str.regionMatches(toffset, Rest, 0, Rest.length())
+ );
").
:- pragma foreign_proc("Erlang",
string.first_char(Str::in, First::in, Rest::in),
@@ -4791,8 +5649,9 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- First = Str[0];
- SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + 1, Rest) == 0);
+ int pos = 0;
+ First = MR_utf8_get_next(Str, &pos);
+ SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + pos, Rest) == 0);
").
:- pragma foreign_proc("C#",
string.first_char(Str::in, First::uo, Rest::in),
@@ -4800,9 +5659,18 @@ string.split(Str, Count, Left, Right) :-
"
int len = Str.Length;
if (len > 0) {
- SUCCESS_INDICATOR = (System.String.CompareOrdinal(Str, 1, Rest, 0, len)
- == 0);
- First = Str[0];
+ char c1 = Str[0];
+ if (System.Char.IsHighSurrogate(c1)) {
+ /* XXX more error checking */
+ char c2 = Str[1];
+ First = System.Char.ConvertToUtf32(c1, c2);
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
+ } else {
+ First = c1;
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
+ }
} else {
SUCCESS_INDICATOR = false;
First = (char) 0;
@@ -4812,13 +5680,15 @@ string.split(Str, Count, Left, Right) :-
string.first_char(Str::in, First::uo, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- if (Str.length() == Rest.length() + 1 && Str.endsWith(Rest)) {
- SUCCESS_INDICATOR = true;
- First = Str.charAt(0);
+ int toffset;
+ if (Str.length() > 0) {
+ First = Str.codePointAt(0);
+ toffset = java.lang.Character.charCount(First);
+ SUCCESS_INDICATOR = Str.regionMatches(toffset, Rest, 0, Rest.length());
} else {
SUCCESS_INDICATOR = false;
// XXX to avoid uninitialized var warning
- First = (char) 0;
+ First = 0;
}
").
:- pragma foreign_proc("Erlang",
@@ -4839,10 +5709,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- if (Str[0] != First || First == '\\0') {
+ int pos = 0;
+ int c = MR_utf8_get_next(Str, &pos);
+ if (c != First || First == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
} else {
- Str++;
+ Str += pos;
/*
** We need to make a copy to ensure that the pointer is word-aligned.
*/
@@ -4858,8 +5730,15 @@ string.split(Str, Count, Left, Right) :-
int len = Str.Length;
if (len > 0) {
- SUCCESS_INDICATOR = (First == Str[0]);
- Rest = Str.Substring(1);
+ if (First <= 0xffff) {
+ SUCCESS_INDICATOR = (First == Str[0]);
+ Rest = Str.Substring(1);
+ } else {
+ string firststr = System.Char.ConvertFromUtf32(First);
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) == 0);
+ Rest = Str.Substring(2);
+ }
} else {
SUCCESS_INDICATOR = false;
Rest = null;
@@ -4872,8 +5751,8 @@ string.split(Str, Count, Left, Right) :-
int len = Str.length();
if (len > 0) {
- SUCCESS_INDICATOR = (First == Str.charAt(0));
- Rest = Str.substring(1);
+ SUCCESS_INDICATOR = (First == Str.codePointAt(0));
+ Rest = Str.substring(java.lang.Character.charCount(First));
} else {
SUCCESS_INDICATOR = false;
// XXX to avoid uninitialized var warning
@@ -4898,11 +5777,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- First = Str[0];
+ int pos = 0;
+ First = MR_utf8_get_next(Str, &pos);
if (First == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
} else {
- Str++;
+ Str += pos;
/*
** We need to make a copy to ensure that the pointer is word-aligned.
*/
@@ -4920,8 +5800,16 @@ string.split(Str, Count, Left, Right) :-
First = (char) 0;
Rest = null;
} else {
- First = Str[0];
- Rest = Str.Substring(1);
+ /* XXX more error checking */
+ char c1 = Str[0];
+ if (System.Char.IsSurrogate(c1)) {
+ char c2 = Str[1];
+ First = System.Char.ConvertToUtf32(c1, c2);
+ Rest = Str.Substring(2);
+ } else {
+ First = Str[0];
+ Rest = Str.Substring(1);
+ }
SUCCESS_INDICATOR = true;
}
}").
@@ -4935,8 +5823,9 @@ string.split(Str, Count, Left, Right) :-
First = (char) 0;
Rest = null;
} else {
- First = Str.charAt(0);
- Rest = Str.substring(1);
+ // XXX exception?
+ First = Str.codePointAt(0);
+ Rest = Str.substring(java.lang.Character.charCount(First));
SUCCESS_INDICATOR = true;
}
}").
@@ -4959,24 +5848,29 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- size_t len = strlen(Rest) + 1;
+ size_t firstw = MR_utf8_width(First);
+ size_t len = firstw + strlen(Rest);
MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
- Str[0] = First;
- strcpy(Str + 1, Rest);
+ MR_utf8_encode(Str, First);
+ strcpy(Str + firstw, Rest);
}").
:- pragma foreign_proc("C#",
string.first_char(Str::uo, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"{
string FirstStr;
- FirstStr = new System.String(First, 1);
- Str = System.String.Concat(FirstStr, Rest);
+ if (First <= 0xffff) {
+ FirstStr = new System.String((char) First, 1);
+ } else {
+ FirstStr = System.Char.ConvertFromUtf32(First);
+ }
+ Str = FirstStr + Rest;
}").
:- pragma foreign_proc("Java",
string.first_char(Str::uo, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"{
- java.lang.String FirstStr = java.lang.String.valueOf(First);
+ String FirstStr = new String(Character.toChars(First));
Str = FirstStr.concat(Rest);
}").
:- pragma foreign_proc("Erlang",
@@ -5086,23 +5980,40 @@ string.format(S1, PT) = S2 :-
%------------------------------------------------------------------------------%
string.words_separator(SepP, String) = Words :-
- I = preceding_boundary(isnt(SepP), String, string.length(String) - 1),
- Words = words_2(SepP, String, I, []).
+ next_boundary(SepP, String, 0, WordStart),
+ words_2(SepP, String, WordStart, Words).
-%------------------------------------------------------------------------------%
+:- pred words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
+ list(string)::out) is det.
+
+words_2(SepP, String, WordStart, Words) :-
+ next_boundary(isnt(SepP), String, WordStart, WordEnd),
+ ( WordEnd = WordStart ->
+ Words = []
+ ;
+ string.unsafe_substring(String, WordStart, WordEnd - WordStart, Word),
+ next_boundary(SepP, String, WordEnd, NextWordStart),
+ ( WordEnd = NextWordStart ->
+ Words = [Word]
+ ;
+ words_2(SepP, String, NextWordStart, Words0),
+ Words = [Word | Words0]
+ )
+ ).
-:- func words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
- list(string)::in) = (list(string)::out) is det.
+ % Return the smallest I >= I0 such that `not P(String[I])'.
+ %
+:- pred next_boundary(pred(char)::in(pred(in) is semidet), string::in, int::in,
+ int::out) is det.
-words_2(SepP, String, WordEnd, Words0) = Words :-
- ( WordEnd < 0 ->
- Words = Words0
+next_boundary(P, String, I0, I) :-
+ (
+ string.unsafe_index_next(String, I0, I1, Char),
+ P(Char)
+ ->
+ next_boundary(P, String, I1, I)
;
- WordPre = preceding_boundary(SepP, String, WordEnd),
- Word = string.unsafe_substring(String, WordPre + 1,
- WordEnd - WordPre),
- PrevWordEnd = preceding_boundary(isnt(SepP), String, WordPre),
- Words = words_2(SepP, String, PrevWordEnd, [Word | Words0])
+ I = I0
).
%------------------------------------------------------------------------------%
@@ -5113,33 +6024,32 @@ string.words(String) = string.words_separator(char.is_whitespace, String).
string.split_at_separator(DelimP, String) = Substrings :-
Len = string.length(String),
- split_at_separator_2(DelimP, String, Len - 1, Len, [], Substrings).
+ split_at_separator_2(DelimP, String, Len, Len, [], Substrings).
:- pred split_at_separator_2(pred(char)::in(pred(in) is semidet), string::in,
int::in, int::in, list(string)::in, list(string)::out) is det.
split_at_separator_2(DelimP, Str, I, SegEnd, Acc0, Acc) :-
- % Walk Str backwards extending the accumulated list of chunks as chars
- % matching DelimP are found.
+ % Walk Str backwards extending the accumulated list of chunks as code
+ % points matching DelimP are found.
%
- % Invariant: -1 =< I < length(Str)
+ % Invariant: 0 =< I =< length(Str)
% SegEnd is one past the last index of the current segment.
%
- ( I < 0 ->
- % We've reached the beginning of the string.
- Seg = string.unsafe_substring(Str, 0, SegEnd),
- Acc = [Seg | Acc0]
- ;
- C = string.unsafe_index(Str, I),
+ ( string.unsafe_prev_index(Str, I, J, C) ->
( DelimP(C) ->
% Chop here.
- SegStart = I + 1,
+ SegStart = I,
Seg = string.unsafe_substring(Str, SegStart, SegEnd - SegStart),
- split_at_separator_2(DelimP, Str, I - 1, I, [Seg | Acc0], Acc)
+ split_at_separator_2(DelimP, Str, J, J, [Seg | Acc0], Acc)
;
% Extend current segment.
- split_at_separator_2(DelimP, Str, I - 1, SegEnd, Acc0, Acc)
+ split_at_separator_2(DelimP, Str, J, SegEnd, Acc0, Acc)
)
+ ;
+ % We've reached the beginning of the string.
+ Seg = string.unsafe_substring(Str, 0, SegEnd),
+ Acc = [Seg | Acc0]
).
%------------------------------------------------------------------------------%
@@ -5156,36 +6066,16 @@ split_at_string(Needle, Total) =
split_at_string(StartAt, NeedleLen, Needle, Total) = Out :-
( sub_string_search_start(Total, Needle, StartAt, NeedlePos) ->
- BeforeNeedle = substring(Total, StartAt, NeedlePos-StartAt),
+ BeforeNeedle = substring(Total, StartAt, NeedlePos - StartAt),
Tail = split_at_string(NeedlePos+NeedleLen, NeedleLen, Needle, Total),
Out = [BeforeNeedle | Tail]
;
- string__split(Total, StartAt, _skip, Last),
+ string.split(Total, StartAt, _Skip, Last),
Out = [Last]
).
%------------------------------------------------------------------------------%
- % preceding_boundary(SepP, String, I) returns the largest index J =< I
- % in String of the char that is SepP and min(-1, I) if there is no such J.
- % preceding_boundary/3 is intended for finding (in reverse) consecutive
- % maximal sequences of chars satisfying some property. Note that I
- % *must not* exceed the largest valid index for String.
- %
-:- func preceding_boundary(pred(char)::in(pred(in) is semidet), string::in,
- int::in) = (int::out) is det.
-
-preceding_boundary(SepP, String, I) =
- ( I < 0 ->
- I
- ; SepP(string.unsafe_index(String, I)) ->
- I
- ;
- preceding_boundary(SepP, String, I - 1)
- ).
-
-%------------------------------------------------------------------------------%
-
S1 ++ S2 = string.append(S1, S2).
%------------------------------------------------------------------------------%
@@ -5203,11 +6093,13 @@ string.det_base_string_to_int(Base, S) = N :-
%-----------------------------------------------------------------------------%
-chomp(S) =
- ( index(S, length(S) - 1, '\n') ->
- left(S, length(S) - 1)
+chomp(S) = Chomp :-
+ % '\n' is always represented by a single code unit.
+ Offset = string.length(S) - 1,
+ ( index(S, Offset, '\n') ->
+ Chomp = left(S, Offset)
;
- S
+ Chomp = S
).
%-----------------------------------------------------------------------------%
@@ -5235,40 +6127,39 @@ lstrip_pred(P, S) = right(S, length(S) - prefix_length(P, S)).
%-----------------------------------------------------------------------------%
-prefix_length(P, S) = prefix_length_2(0, length(S), P, S).
+prefix_length(P, S) = Index :-
+ prefix_length_2(P, S, 0, Index).
-:- func prefix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
- string::in) = (int::out) is det.
+:- pred prefix_length_2(pred(char)::in(pred(in) is semidet),
+ string::in, int::in, int::out) is det.
-prefix_length_2(I, N, P, S) =
- % XXX We are using if-then-elses to get ordered conjunction.
- ( I < N ->
- ( P(S ^ unsafe_elem(I)) ->
- prefix_length_2(I + 1, N, P, S)
- ;
- I
- )
+prefix_length_2(P, S, I, Index) :-
+ (
+ string.unsafe_index_next(S, I, J, Char),
+ P(Char)
+ ->
+ prefix_length_2(P, S, J, Index)
;
- I
+ Index = I
).
%-----------------------------------------------------------------------------%
-suffix_length(P, S) = suffix_length_2(length(S) - 1, length(S), P, S).
+suffix_length(P, S) = End - Index :-
+ End = string.length(S),
+ suffix_length_2(P, S, End, Index).
-:- func suffix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
- string::in) = (int::out) is det.
+:- pred suffix_length_2(pred(char)::in(pred(in) is semidet),
+ string::in, int::in, int::out) is det.
-suffix_length_2(I, N, P, S) =
- % XXX We are using if-then-elses to get ordered conjunction.
- ( 0 =< I ->
- ( P(S ^ unsafe_elem(I)) ->
- suffix_length_2(I - 1, N, P, S)
- ;
- N - (I + 1)
- )
+suffix_length_2(P, S, I, Index) :-
+ (
+ string.unsafe_prev_index(S, I, J, Char),
+ P(Char)
+ ->
+ suffix_length_2(P, S, J, Index)
;
- N - (I + 1)
+ Index = I
).
%------------------------------------------------------------------------------%
@@ -5686,28 +6577,6 @@ private_builtin_type_info_to_revstrings(PrivateBuiltinTypeInfo, !Rs) :-
det_dynamic_cast(X, Y) :-
det_univ_to_type(univ(X), Y).
-%-----------------------------------------------------------------------------%
-
- % char_list_remove_suffix/3: We use this instead of the more general
- % list.remove_suffix so that (for example) string.format will succeed in
- % grade Java, even though unification has not yet been implemented.
- %
-:- pred char_list_remove_suffix(list(char)::in, list(char)::in,
- list(char)::out) is semidet.
-
-char_list_remove_suffix(List, Suffix, Prefix) :-
- list.length(List, ListLength),
- list.length(Suffix, SuffixLength),
- PrefixLength = ListLength - SuffixLength,
- list.split_list(PrefixLength, List, Prefix, Rest),
- char_list_equal(Suffix, Rest).
-
-:- pred char_list_equal(list(char)::in, list(char)::in) is semidet.
-
-char_list_equal([], []).
-char_list_equal([X | Xs], [X | Ys]) :-
- char_list_equal(Xs, Ys).
-
%------------------------------------------------------------------------------%
string.format_table(Columns, Separator) = Table :-
@@ -5730,7 +6599,7 @@ string.format_table(Columns, Separator) = Table :-
string.format_table_max(ColumnsLimits, Separator) = Table :-
MaxWidthsSenses = list.map(find_max_length_with_limit, ColumnsLimits),
Columns = list.map(project_column_strings, ColumnsLimits),
- SepLen = string.length(Separator),
+ SepLen = string.count_codepoints(Separator),
generate_rows(MaxWidthsSenses, Separator, SepLen, Columns, [], RevRows),
list.reverse(RevRows, Rows),
Table = string.join_list("\n", Rows).
@@ -5784,7 +6653,7 @@ pad_row([Justify - MaxWidth | JustifyWidths], [ColumnString0 | ColumnStrings0],
NextColumn = CurColumn + MaxWidth + SepLen,
pad_row(JustifyWidths, ColumnStrings0, Separator, SepLen, NextColumn,
LineRest),
- ( string.length(ColumnString0) =< MaxWidth ->
+ ( string.count_codepoints(ColumnString0) =< MaxWidth ->
(
Justify = just_left,
ColumnString = string.pad_right(ColumnString0, ' ', MaxWidth)
@@ -5870,7 +6739,7 @@ lpad(Chr, N, Str) = string.pad_left(Str, Chr, N).
:- pred max_str_length(string::in, int::in, int::out) is det.
max_str_length(Str, PrevMaxLen, MaxLen) :-
- Length = string.length(Str),
+ Length = string.count_codepoints(Str),
( Length > PrevMaxLen ->
MaxLen = Length
;
@@ -5883,7 +6752,7 @@ word_wrap(Str, N) = word_wrap_separator(Str, N, "").
word_wrap_separator(Str, N, WordSep) = Wrapped :-
Words = string.words_separator(char.is_whitespace, Str),
- SepLen = string.length(WordSep),
+ SepLen = string.count_codepoints(WordSep),
( SepLen < N ->
word_wrap_2(Words, WordSep, SepLen, 1, N, [], Wrapped)
;
@@ -5899,7 +6768,7 @@ word_wrap_2([], _, _, _, _, RevStrs,
word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
% Col is the column where the next character should be written if there
% is space for a whole word.
- WordLen = string.length(Word),
+ WordLen = string.count_codepoints(Word),
(
% We are on the first column and the length of the word
% is less than the line length.
@@ -5972,10 +6841,10 @@ word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
:- func break_up_string_reverse(string, int, list(string)) = list(string).
break_up_string_reverse(Str, N, Prev) = Strs :-
- ( string.length(Str) =< N ->
+ ( string.count_codepoints(Str) =< N ->
Strs = [Str | Prev]
;
- string.split(Str, N, Left, Right),
+ string.split_by_codepoint(Str, N, Left, Right),
Strs = break_up_string_reverse(Right, N, [Left | Prev])
).
diff --git a/library/term_io.m b/library/term_io.m
index 0be7ccb..9b7acf5 100644
--- a/library/term_io.m
+++ b/library/term_io.m
@@ -784,6 +784,7 @@ mercury_escape_char(Char) = EscapeCode :-
is_mercury_source_char(Char) :-
( char.is_alnum(Char)
; is_mercury_punctuation_char(Char)
+ ; char.to_int(Char) >= 0x80
).
% Currently we only allow the following characters.
diff --git a/runtime/mercury_types.h b/runtime/mercury_types.h
index fc51fc8..fd32326 100644
--- a/runtime/mercury_types.h
+++ b/runtime/mercury_types.h
@@ -128,11 +128,11 @@ typedef MR_Code *MR_CodePtr;
** If you modify them, you will need to modify mercury_string.h as well.
*/
-typedef char MR_Char;
-typedef unsigned char MR_UnsignedChar;
+typedef int MR_Char;
+typedef unsigned int MR_UnsignedChar;
-typedef MR_Char *MR_String;
-typedef const MR_Char *MR_ConstString;
+typedef char *MR_String;
+typedef const char *MR_ConstString;
/*
** Definitions for accessing the representation of the Mercury `array' type.
diff --git a/tests/debugger/tailrec1.m b/tests/debugger/tailrec1.m
index a2949d3..13e680f 100644
--- a/tests/debugger/tailrec1.m
+++ b/tests/debugger/tailrec1.m
@@ -55,7 +55,9 @@ tailrec1_read_strings(Stream, !Words, !IO) :-
int i;
i = 0;
- while ((c = mercury_getc((MercuryFilePtr) Stream)) != EOF && c != '\\n') {
+ while ((c = mercury_get_byte((MercuryFilePtr) Stream)) != EOF &&
+ c != '\\n')
+ {
if (i < 100) {
buf[i] = c;
}
diff --git a/tests/hard_coded/special_char.exp b/tests/hard_coded/special_char.exp
index 52dc034..91f2fe8 100644
--- a/tests/hard_coded/special_char.exp
+++ b/tests/hard_coded/special_char.exp
@@ -1,4 +1,4 @@
Hello world
-?S�
+?SÑ
diff --git a/tests/hard_coded/unicode_test.exp b/tests/hard_coded/unicode_test.exp
index 5ed1bf3..cccf2b8 100644
--- a/tests/hard_coded/unicode_test.exp
+++ b/tests/hard_coded/unicode_test.exp
@@ -1,11 +1,11 @@
00000011
00000011
-11001110 10010100
-11001110 10100000
-11101111 10111111 10111111
-11110100 10001111 10111111 10111111
-11110010 10101011 10110011 10011110
-01110010 11000011 10101001 01110011 01110101 01101101 11000011 10101001
+1110010100
+1110100000
+1111111111111111
+100001111111111111111
+10101011110011011110
+01110010 11101001 01110011 01110101 01101101 11101001
01100001 01100010 01100011 00110001 00110010 00110011
01011100 01110101 00110000 00110000 00110100 00110001
01011100 01110101 00110000 00110000 00110100 00110001
diff --git a/tests/valid/mercury_java_parser_follow_code_bug.m b/tests/valid/mercury_java_parser_follow_code_bug.m
index a9d42a4..92dee13 100644
--- a/tests/valid/mercury_java_parser_follow_code_bug.m
+++ b/tests/valid/mercury_java_parser_follow_code_bug.m
@@ -500,7 +500,7 @@ null_literal -->
% [5]Contents | [6]Prev | [7]Next | [8]Index Java Language Specification
% Second Edition
%
-% [9]Copyright � 2000 Sun Microsystems, Inc. All rights reserved
+% [9]Copyright © 2000 Sun Microsystems, Inc. All rights reserved
% Please send any comments or corrections to the [10]JLS team
% Syntax
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to: mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions: mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the reviews
mailing list