[m-rev.] preliminary review: UTF-8/16 support
Peter Wang
novalazy at gmail.com
Wed Feb 16 17:19:57 AEDT 2011
On 2010-12-01, Peter Wang <novalazy at gmail.com> wrote:
> Here is some work in progress on UTF-8 and UTF-16 string support, in
> case someone would like to look at it or make suggestions.
>
> We would commit to using the Universal Character Set instead of
> trying to be agnostic about string encodings.
> Mercury `char' becomes a Unicode code point, so is re-defined as a
> 32-bit integer (at least).
>
> String indexing predicates return code points instead of code units.
> e.g. `string.index(String, CodeUnitOffset, CodePoint)' returns the
> code point beginning at CodeUnitOffset.
>
> When you iterate over a string you should not advance to Index + 1 any
> more, so there is are predicates which return the next/previous offset:
> string.index_next(String, Offset, CodePoint, NextOffset)
> string.prev_index(String, NextOffset, CodePoint, Offset)
Newer patch as requested by Julien.
Peter
diff --git a/compiler/c_util.m b/compiler/c_util.m
index 8f9c30a..bfe1cf1 100644
--- a/compiler/c_util.m
+++ b/compiler/c_util.m
@@ -179,11 +179,13 @@
:- implementation.
+:- import_module libs.compiler_util.
:- import_module libs.options.
:- import_module bool.
:- import_module int.
:- import_module list.
+:- import_module require.
:- import_module string.
%-----------------------------------------------------------------------------%
@@ -307,7 +309,27 @@ output_quoted_string(S, !IO) :-
output_quoted_string_lang(literal_c, S, !IO).
output_quoted_string_lang(Lang, S, !IO) :-
- do_output_quoted_string(Lang, 0, length(S), S, !IO).
+ (
+ Lang = literal_c,
+ % Avoid a limitation in the MSVC compiler where string literals can be
+ % no longer than 2048 chars. However if you output the string in
+ % chunks, eg "part a" "part b" it will accept a string longer than 2048
+ % chars, go figure!
+ % XXX we could avoid creating new strings
+ string.split_by_codepoint(S, 160, Left, Right),
+ do_output_quoted_string(Lang, Left, 0, !IO),
+ ( Right = "" ->
+ true
+ ;
+ io.write_string("\" \"", !IO),
+ output_quoted_string_lang(Lang, Right, !IO)
+ )
+ ;
+ ( Lang = literal_java
+ ; Lang = literal_csharp
+ ),
+ do_output_quoted_string(Lang, S, 0, !IO)
+ ).
output_quoted_multi_string(Ss, !IO) :-
output_quoted_multi_string_lang(literal_c, Ss, !IO).
@@ -318,50 +340,13 @@ output_quoted_multi_string_lang(Lang, [S | Ss], !IO) :-
output_quoted_char_lang(Lang, char.det_from_int(0), !IO),
output_quoted_multi_string_lang(Lang, Ss, !IO).
-:- pred do_output_quoted_string(literal_language::in, int::in, int::in,
- string::in, io::di, io::uo) is det.
+:- pred do_output_quoted_string(literal_language::in, string::in,
+ int::in, io::di, io::uo) is det.
-do_output_quoted_string(Lang, Cur, Len, S, !IO) :-
- ( Cur < Len ->
- % Avoid a limitation in the MSVC compiler where string literals
- % can be no longer than 2048 chars. However if you output the string
- % in chunks, eg "part a" "part b" it will accept a string longer than
- % 2048 chars, go figure!
- (
- Lang = literal_c,
- Cur \= 0,
- Cur mod 512 = 0
- ->
- io.write_string("\" \"", !IO)
- ;
- true
- ),
-
- string.unsafe_index(S, Cur, Char),
+do_output_quoted_string(Lang, S, Cur, !IO) :-
+ ( string.unsafe_index_next(S, Cur, Next, Char) ->
output_quoted_char_lang(Lang, Char, !IO),
-
- % Check for trigraph sequences in string literals. We break the
- % trigraph by breaking the string into multiple chunks. For example,
- % "??-" gets converted to "?" "?-".
- (
- Lang = literal_c,
- Char = '?',
- Cur + 2 < Len
- ->
- (
- string.unsafe_index(S, Cur + 1, '?'),
- string.unsafe_index(S, Cur + 2, ThirdChar),
- is_trigraph_char(ThirdChar)
- ->
- io.write_string("\" \"", !IO)
- ;
- true
- )
- ;
- true
- ),
-
- do_output_quoted_string(Lang, Cur + 1, Len, S, !IO)
+ do_output_quoted_string(Lang, S, Next, !IO)
;
true
).
@@ -403,25 +388,40 @@ quote_one_char(Lang, Char, RevChars0, RevChars) :-
->
RevChars = [EscapeChar, '\\' | RevChars0]
;
- is_c_source_char(Char)
+ Lang = literal_c,
+ Char = '?'
->
- RevChars = [Char | RevChars0]
+ % Avoid trigraphs by escaping the question marks.
+ RevChars = ['?', '\\' | RevChars0]
;
- Lang = literal_java,
- char.to_int(Char) >= 0x80
+ is_c_source_char(Char)
->
- % If the compiler is built in a C grade (8-bit strings), we assume that
- % both the Mercury source file and Java target file use UTF-8 encoding.
- % Each `Char' will be a UTF-8 code unit in a multi-byte sequence.
- % If the compiler is built in a Java backend, each `Char' will be a
- % UTF-16 code unit, possibly of a surrogate pair. In both cases the
- % code units must be passed through without escaping.
RevChars = [Char | RevChars0]
;
char.to_int(Char, 0)
->
RevChars = ['0', '\\' | RevChars0]
;
+ Int = char.to_int(Char),
+ Int >= 0x80
+ ->
+ (
+ Lang = literal_c,
+ ( char.to_utf8(Char, CodeUnits) ->
+ list.map(octal_escape_any_int, CodeUnits, EscapeCharss),
+ list.condense(EscapeCharss, EscapeChars),
+ reverse_append(EscapeChars, RevChars0, RevChars)
+ ;
+ unexpected($module, $pred, "invalid Unicode code point")
+ )
+ ;
+ Lang = literal_java,
+ RevChars = [Char | RevChars0]
+ ;
+ Lang = literal_csharp,
+ RevChars = [Char | RevChars0]
+ )
+ ;
(
Lang = literal_c,
octal_escape_any_char(Char, EscapeChars)
@@ -430,7 +430,7 @@ quote_one_char(Lang, Char, RevChars0, RevChars) :-
octal_escape_any_char(Char, EscapeChars)
;
Lang = literal_csharp,
- hex_escape_any_char(Char, EscapeChars)
+ unicode_escape_any_char(Char, EscapeChars)
),
reverse_append(EscapeChars, RevChars0, RevChars)
).
@@ -453,20 +453,6 @@ escape_special_char('\v', 'v'). % not in Java
escape_special_char('\r', 'r').
escape_special_char('\f', 'f').
- % Succeed if the given character, prefixed with "??", is a trigraph.
- %
-:- pred is_trigraph_char(char::in) is semidet.
-
-is_trigraph_char('(').
-is_trigraph_char(')').
-is_trigraph_char('<').
-is_trigraph_char('>').
-is_trigraph_char('=').
-is_trigraph_char('/').
-is_trigraph_char('\'').
-is_trigraph_char('!').
-is_trigraph_char('-').
-
% This succeeds iff the specified character is allowed as an (unescaped)
% character in standard-conforming C source code.
%
@@ -501,19 +487,20 @@ reverse_append([X | Xs], L0, L) :-
%
octal_escape_any_char(Char, EscapeCodeChars) :-
char.to_int(Char, Int),
+ octal_escape_any_int(Int, EscapeCodeChars).
+
+:- pred octal_escape_any_int(int::in, list(char)::out) is det.
+
+octal_escape_any_int(Int, EscapeCodeChars) :-
string.int_to_base_string(Int, 8, OctalString0),
string.pad_left(OctalString0, '0', 3, OctalString),
EscapeCodeChars = ['\\' | string.to_char_list(OctalString)].
-:- pred hex_escape_any_char(char::in, list(char)::out) is det.
+:- pred unicode_escape_any_char(char::in, list(char)::out) is det.
- % Convert a character to the corresponding hexadeciaml escape code.
- % XXX This assumes that the target language compiler's representation
- % of characters is the same as the Mercury compiler's.
- %
-hex_escape_any_char(Char, EscapeCodeChars) :-
+unicode_escape_any_char(Char, EscapeCodeChars) :-
char.to_int(Char, Int),
- string.format("\\x%04x", [i(Int)], HexString),
+ string.format("\\u%04x", [i(Int)], HexString),
string.to_char_list(HexString, EscapeCodeChars).
%-----------------------------------------------------------------------------%
diff --git a/compiler/compile_target_code.m b/compiler/compile_target_code.m
index fe4a07b..5552c4f 100644
--- a/compiler/compile_target_code.m
+++ b/compiler/compile_target_code.m
@@ -404,8 +404,8 @@ do_compile_c_file(ErrorStream, PIC, C_File, O_File, Globals, Succeeded, !IO) :-
string.append_list([
CC, " ",
AllCFlags,
- " -c ", C_File, " ",
- NameObjectFile, O_File], Command),
+ " -c ", quote_arg(C_File), " ",
+ NameObjectFile, quote_arg(O_File)], Command),
get_maybe_filtercc_command(Globals, MaybeFilterCmd),
invoke_system_command_maybe_filter_output(Globals, ErrorStream,
cmd_verbose_commands, Command, MaybeFilterCmd, Succeeded, !IO).
@@ -1262,7 +1262,8 @@ module_name_to_file_name_ext(Globals, Ext, MkDir, ModuleName, FileName, !IO) :-
invoke_mkinit(Globals, InitFileStream, Verbosity,
MkInit, Args, FileNames, MkInitOK, !IO) :-
- join_quoted_string_list(FileNames, "", "\n", "", TargetFileNames),
+ % mkinit expects unquoted file names.
+ join_string_list(FileNames, "", "\n", "", TargetFileNames),
io.make_temp(TmpFile, !IO),
io.open_output(TmpFile, OpenResult, !IO),
@@ -1969,7 +1970,7 @@ link_exe_or_shared_lib(Globals, ErrorStream, LinkTargetType, ModuleName,
UndefOpt, " ",
ThreadOpts, " ",
TraceOpts, " ",
- " -o ", OutputFileName, " ",
+ " -o ", quote_arg(OutputFileName), " ",
Objects, " ",
LinkOptSep, " ",
LinkLibraryDirectories, " ",
diff --git a/compiler/elds_to_erlang.m b/compiler/elds_to_erlang.m
index 6beb095..23d3884 100644
--- a/compiler/elds_to_erlang.m
+++ b/compiler/elds_to_erlang.m
@@ -1178,7 +1178,7 @@ write_with_escaping(StringOrAtom, String, !IO) :-
write_with_escaping_2(StringOrAtom, Char, !IO) :-
char.to_int(Char, Int),
(
- 32 =< Int, Int =< 126,
+ 32 =< Int, % Int =< 126,
Char \= ('\\'),
(
StringOrAtom = in_string,
diff --git a/compiler/mlds_to_cs.m b/compiler/mlds_to_cs.m
index 9d8133d..686b8b0 100644
--- a/compiler/mlds_to_cs.m
+++ b/compiler/mlds_to_cs.m
@@ -2238,7 +2238,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
ArrayDims = []
;
MLDS_Type = mlds_native_char_type,
- String = "char",
+ % C# `char' not large enough for code points so we must use `int'.
+ String = "int",
ArrayDims = []
;
MLDS_Type = mlds_foreign_type(ForeignType),
@@ -2333,7 +2334,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
(
CtorCat = ctor_cat_builtin(cat_builtin_char),
- String = "char",
+ % C# `char' not large enough for code points so we must use `int'.
+ String = "int",
ArrayDims = []
;
CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -3513,9 +3515,10 @@ csharp_builtin_type(Type, "double") :-
Type = mlds_native_float_type.
csharp_builtin_type(Type, "double") :-
Type = mercury_type(builtin_type(builtin_type_float), _, _).
-csharp_builtin_type(Type, "char") :-
+ % C# `char' not large enough for code points so we must use `int'.
+csharp_builtin_type(Type, "int") :-
Type = mlds_native_char_type.
-csharp_builtin_type(Type, "char") :-
+csharp_builtin_type(Type, "int") :-
Type = mercury_type(builtin_type(builtin_type_char), _, _).
csharp_builtin_type(Type, "bool") :-
Type = mlds_native_bool_type.
@@ -3609,7 +3612,7 @@ output_rval_const(Info, Const, !IO) :-
output_int_const(N, !IO)
;
Const = mlconst_char(N),
- io.write_string("((char) ", !IO),
+ io.write_string("( ", !IO),
output_int_const(N, !IO),
io.write_string(")", !IO)
;
diff --git a/compiler/mlds_to_java.m b/compiler/mlds_to_java.m
index 3004247..cf9db1c 100644
--- a/compiler/mlds_to_java.m
+++ b/compiler/mlds_to_java.m
@@ -3449,7 +3449,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
ArrayDims = []
;
MLDS_Type = mlds_native_char_type,
- String = "char",
+ % Java `char' not large enough for code points so we must use `int'.
+ String = "int",
ArrayDims = []
;
MLDS_Type = mlds_foreign_type(ForeignType),
@@ -3543,7 +3544,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
(
CtorCat = ctor_cat_builtin(cat_builtin_char),
- String = "char",
+ % Java `char' not large enough for code points so we must use `int'.
+ String = "int",
ArrayDims = []
;
CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -4872,9 +4874,10 @@ java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
Type = mlds_native_float_type.
java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
Type = mercury_type(builtin_type(builtin_type_float), _, _).
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+ % Java `char' not large enough for code points so we must use `int'.
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
Type = mlds_native_char_type.
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
Type = mercury_type(builtin_type(builtin_type_char), _, _).
java_builtin_type(Type, "boolean", "java.lang.Boolean", "booleanValue") :-
Type = mlds_native_bool_type.
@@ -5001,7 +5004,7 @@ output_rval_const(Info, Const, !IO) :-
output_int_const(N, !IO)
;
Const = mlconst_char(N),
- io.write_string("((char) ", !IO),
+ io.write_string("(", !IO),
output_int_const(N, !IO),
io.write_string(")", !IO)
;
diff --git a/doc/reference_manual.texi b/doc/reference_manual.texi
index 5b7879f..f33f0a6 100644
--- a/doc/reference_manual.texi
+++ b/doc/reference_manual.texi
@@ -6742,11 +6742,13 @@ than @samp{sizeof(MR_Word)}.
@node C# data passing conventions
@subsection C# data passing conventions
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
and @code{string} are mapped to the Common Language Infrastructure (CLI) types
- at code{System.Int32}, @code{System.Double}, @code{System.Char} and
+ at code{System.Int32}, @code{System.Double}, and
@code{System.String} respectively, which correspond to the C# types
- at code{int}, @code{double}, @code{char}, and @code{string}.
+ at code{int}, @code{double}, and @code{string}.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the CLI
+type @code{System.Char} because that only holds 16-bit numeric values.
For the Mercury standard library type @samp{bool.bool}, there is a
corresponding C# type, @code{mr_bool.Bool_0}. C# code can refer to the
@@ -6858,10 +6860,12 @@ List_1 list.cons(object head, List_1 tail)
@node Java data passing conventions
@subsection Java data passing conventions
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
and @code{string} are mapped to the Java types
- at code{int}, @code{double}, @code{char} and
+ at code{int}, @code{double}, and
@code{java.lang.String} respectively.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the Java
+type @code{char} because that only holds 16-bit numeric values.
For the Mercury standard library type @samp{bool.bool}, there is a
corresponding Java type, @code{bool.Bool_0}. Java code can refer to the
diff --git a/library/char.m b/library/char.m
index 5771ebf..f4ef1df 100644
--- a/library/char.m
+++ b/library/char.m
@@ -24,31 +24,28 @@
:- module char.
:- interface.
+
:- import_module enum.
+:- import_module list.
:- import_module pretty_printer.
%-----------------------------------------------------------------------------%
+ % A Unicode code point.
+ %
:- type char == character.
:- instance enum(character).
% Convert a character to its corresponding numerical code (integer value).
- % Beware that the mapping from characters to numerical codes is
- % implementation-dependent; there is no guarantee that the integer values
- % for characters will fit in 8 bits. Furthermore, the value returned from
- % char.to_int might be different than the byte(s) used to store the
- % character in a file. There is also no guarantee that characters created
- % using `char.to_int(out, in)' can be written to files or to the standard
- % output or standard error streams. For example, an implementation might
- % represent characters using Unicode, but store files in an 8-bit national
- % character set.
%
% Note that '\0' is not accepted as a Mercury null character constant.
% Instead, a null character can be created using `char.det_from_int(0)'.
% Null characters aren't very useful in Mercury because they aren't
% allowed in strings.
%
+ % Codepoints outside the Unicode range are also not accepted.
+ %
:- func char.to_int(char) = int.
:- pred char.to_int(char, int).
:- mode char.to_int(in, out) is det.
@@ -97,8 +94,8 @@
:- mode char.lower_upper(in, out) is semidet.
:- mode char.lower_upper(out, in) is semidet.
- % True iff the character is whitespace, i.e. a space, tab,
- % newline, carriage return, form-feed, or vertical tab.
+ % True iff the character is a whitespace character in the ASCII range,
+ % i.e. a space, tab, newline, carriage return, form-feed, or vertical tab.
%
:- pred char.is_whitespace(char::in) is semidet.
@@ -175,11 +172,26 @@
%
:- func char.char_to_doc(char) = pretty_printer.doc.
+ % Encode a Unicode code point in UTF-8.
+ %
+:- pred char.to_utf8(char::in, list(int)::out) is semidet.
+
+ % Encode a Unicode code point in UTF-16 (native endianness).
+ %
+:- pred char.to_utf16(char::in, list(int)::out) is semidet.
+
+ % Succeed if `Char' is a Unicode surrogate code point.
+ % In UTF-16, a code point with a scalar value greater than 0xffff
+ % is encoded with a pair of surrogate code points.
+ %
+:- pred char.is_surrogate(char::in) is semidet.
+
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
+:- import_module int.
:- import_module require.
:- import_module term_io.
@@ -513,13 +525,8 @@ char.det_from_int(Int) = Char :-
[will_not_call_mercury, promise_pure, thread_safe,
does_not_affect_liveness],
"
- /*
- ** If the integer doesn't fit into a char, then the assignment
- ** `Character = Int' below will truncate it. SUCCESS_INDICATOR will be set
- ** to true only if the result was not truncated.
- */
Character = Int;
- SUCCESS_INDICATOR = ((MR_UnsignedChar) Character == Int);
+ SUCCESS_INDICATOR = (Character >= 0 && Character <= 0x10ffff);
").
:- pragma foreign_proc("C#",
@@ -540,8 +547,8 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Character = (char) Int;
- SUCCESS_INDICATOR = (Character == Int);
+ Character = Int;
+ SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Java",
@@ -562,8 +569,8 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Character = (char) Int;
- SUCCESS_INDICATOR = ((int) Character == Int);
+ Character = Int;
+ SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Erlang",
@@ -584,16 +591,8 @@ char.det_from_int(Int) = Char :-
char.to_int(Character::out, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- case
- Int >= 0 andalso Int < 256
- of
- true ->
- SUCCESS_INDICATOR = true,
- Character = Int;
- false ->
- SUCCESS_INDICATOR = false,
- Character = -1
- end
+ Character = Int,
+ SUCCESS_INDICATOR = (Int >= 0 andalso Int =< 16#10ffff)
").
% We used unsigned character codes, so the minimum character code
@@ -607,33 +606,70 @@ char.min_char_value(0).
[will_not_call_mercury, promise_pure, thread_safe,
does_not_affect_liveness],
"
- Max = UCHAR_MAX;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("C#",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- // .NET uses 16-bit 'Unicode'. This might be either UCS-2,
- // where Unicode characters that don't fit in 16 bits are encoded
- // in two 16 bit characters, or it might be just the 16-bit subset,
- // i.e. only the Unicode characters that fit in 16 bits.
- // For our purposes, it doesn't matter.
- Max = 0xffff;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("Java",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Max = (int) java.lang.Character.MAX_VALUE;
+ Max = 0x10ffff;
").
:- pragma foreign_proc("Erlang",
char.max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- % XXX actually Erlang doesn't have chars so there should be no maximum
- Max = 255
+ Max = 16#10ffff
").
+char.to_utf8(Char, CodeUnits) :-
+ Int = char.to_int(Char),
+ ( Int =< 0x7f ->
+ CodeUnits = [Int]
+ ; Int =< 0x7ff ->
+ A = 0xc0 \/ ((Int >> 6) /\ 0x1f),
+ B = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B]
+ ; Int =< 0xffff ->
+ A = 0xe0 \/ ((Int >> 12) /\ 0x0f),
+ B = 0x80 \/ ((Int >> 6) /\ 0x3f),
+ C = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B, C]
+ ; Int =< 0x10ffff ->
+ A = 0xf0 \/ ((Int >> 18) /\ 0x07),
+ B = 0x80 \/ ((Int >> 12) /\ 0x3f),
+ C = 0x80 \/ ((Int >> 6) /\ 0x3f),
+ D = 0x80 \/ (Int /\ 0x3f),
+ CodeUnits = [A, B, C, D]
+ ;
+ % Illegal code point.
+ fail
+ ).
+
+char.to_utf16(Char, CodeUnits) :-
+ Int = char.to_int(Char),
+ ( Int =< 0xffff ->
+ CodeUnits = [Int]
+ ; Int =< 0x10ffff ->
+ U = Int - 0x10000,
+ A = 0xd800 \/ (U >> 10),
+ B = 0xdc00 \/ (U /\ 0x3ff),
+ CodeUnits = [A, B]
+ ;
+ % Illegal code point.
+ fail
+ ).
+
+char.is_surrogate(Char) :-
+ Int = char.to_int(Char),
+ Int >= 0xd800,
+ Int =< 0xdfff.
+
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% Ralph Becket <rwab1 at cl.cam.ac.uk> 27/04/99
diff --git a/library/io.m b/library/io.m
index 89d0d94..ed568d5 100644
--- a/library/io.m
+++ b/library/io.m
@@ -133,7 +133,7 @@
% Text input predicates
%
- % Reads a character from the current input stream.
+ % Reads a character (code point) from the current input stream.
%
:- pred io.read_char(io.result(char)::out, io::di, io::uo) is det.
@@ -142,7 +142,7 @@
:- pred io.read_word(io.result(list(char))::out, io::di, io::uo) is det.
% Reads a line from the current input stream, returns the result
- % as a list of chars.
+ % as a list of characters (code points).
%
:- pred io.read_line(io.result(list(char))::out, io::di, io::uo) is det.
@@ -152,14 +152,14 @@
%
:- pred io.read_line_as_string(io.result(string)::out, io::di, io::uo) is det.
- % Reads all the characters from the current input stream until
- % eof or error.
+ % Reads all the characters (code points) from the current input stream
+ % until eof or error.
%
:- pred io.read_file(io.maybe_partial_res(list(char))::out, io::di, io::uo)
is det.
- % Reads all the characters from the current input stream until
- % eof or error. Returns the result as a string rather than
+ % Reads all the characters (code points) from the current input stream
+ % until eof or error. Returns the result as a string rather than
% as a list of char.
%
% Returns an error if the file contains a null character, because
@@ -168,7 +168,7 @@
:- pred io.read_file_as_string(io.maybe_partial_res(string)::out,
io::di, io::uo) is det.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl(pred(char, T, T), T, io.maybe_partial_res(T),
@@ -178,7 +178,7 @@
:- mode io.input_stream_foldl((pred(in, in, out) is cc_multi), in, out,
di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl_io(pred(char, io, io), io.res, io, io).
@@ -187,7 +187,7 @@
:- mode io.input_stream_foldl_io((pred(in, di, uo) is cc_multi), out, di, uo)
is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl2_io(pred(char, T, T, io, io),
@@ -197,9 +197,9 @@
:- mode io.input_stream_foldl2_io((pred(in, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from the input stream
- % in turn, until eof or error, or the closure returns `no' as its
- % second argument.
+ % Applies the given closure to each character (code point) read from the
+ % input stream in turn, until eof or error, or the closure returns `no' as
+ % its second argument.
%
:- pred io.input_stream_foldl2_io_maybe_stop(
pred(char, bool, T, T, io, io),
@@ -211,7 +211,7 @@
(pred(in, out, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Un-reads a character from the current input stream.
+ % Un-reads a character (code point) from the current input stream.
% You can put back as many characters as you like.
% You can even put back something that you didn't actually read.
% Note: `io.putback_char' uses the C library function ungetc().
@@ -220,12 +220,12 @@
%
:- pred io.putback_char(char::in, io::di, io::uo) is det.
- % Reads a character from specified stream.
+ % Reads a character (code point) from specified stream.
%
:- pred io.read_char(io.input_stream::in, io.result(char)::out,
io::di, io::uo) is det.
- % Reads a character from the specified stream.
+ % Reads a character (code point) from the specified stream.
% This interface avoids memory allocation when there is no error.
%
:- pred io.read_char_unboxed(io.input_stream::in, io.result::out, char::out,
@@ -237,7 +237,7 @@
io::di, io::uo) is det.
% Reads a line from specified stream, returning the result
- % as a list of chars.
+ % as a list of characters (code point).
%
:- pred io.read_line(io.input_stream::in, io.result(list(char))::out,
io::di, io::uo) is det.
@@ -249,7 +249,7 @@
:- pred io.read_line_as_string(io.input_stream::in, io.result(string)::out,
io::di, io::uo) is det.
- % Reads all the characters from the given input stream until
+ % Reads all the characters (code points) from the given input stream until
% eof or error.
%
:- pred io.read_file(io.input_stream::in,
@@ -264,7 +264,7 @@
:- pred io.read_file_as_string(io.input_stream::in,
io.maybe_partial_res(string)::out, io::di, io::uo) is det.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl(io.input_stream, pred(char, T, T),
@@ -274,7 +274,7 @@
:- mode io.input_stream_foldl(in, in(pred(in, in, out) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl_io(io.input_stream, pred(char, io, io),
@@ -284,7 +284,7 @@
:- mode io.input_stream_foldl_io(in, in(pred(in, di, uo) is cc_multi),
out, di, uo) is cc_multi.
- % Applies the given closure to each character read from
+ % Applies the given closure to each character (code point) read from
% the input stream in turn, until eof or error.
%
:- pred io.input_stream_foldl2_io(io.input_stream,
@@ -297,9 +297,9 @@
in(pred(in, in, out, di, uo) is cc_multi),
in, out, di, uo) is cc_multi.
- % Applies the given closure to each character read from the input stream
- % in turn, until eof or error, or the closure returns `no' as its
- % second argument.
+ % Applies the given closure to each character (code point) read from the
+ % input stream in turn, until eof or error, or the closure returns `no' as
+ % its second argument.
%
:- pred io.input_stream_foldl2_io_maybe_stop(io.input_stream,
pred(char, bool, T, T, io, io),
@@ -1806,12 +1806,12 @@
% we don't want the C/Java/etc code to depend on how Mercury stores
% its discriminated union data types.
- % Reads a character from specified stream, and returns the numerical value
- % for that character (as from char.to_int). This may involve converting
- % external character encodings into Mercury's internal character
- % representation and (for text streams) converting OS line indicators,
- % e.g. CR-LF for Windows, to '\n' characters. Returns -1 if at EOF,
- % -2 if an error occurs.
+ % Reads a character (code point) from specified stream, and returns the
+ % numerical value for that character (as from char.to_int). This may
+ % involve converting external character encodings into Mercury's internal
+ % character representation and (for text streams) converting OS line
+ % indicators, e.g. CR-LF for Windows, to '\n' characters.
+ % Returns -1 if at EOF, -2 if an error occurs.
%
:- pred io.read_char_code(io.input_stream::in, int::out, io::di, io::uo)
is det.
@@ -2170,15 +2170,15 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
#define ML_IO_BYTES_TO_WORDS(n) (((n) + sizeof(MR_Word) - 1) / sizeof(MR_Word))
#define ML_IO_READ_LINE_START 1024
- MR_Char initial_read_buffer[ML_IO_READ_LINE_START];
- MR_Char *read_buffer = initial_read_buffer;
+ char initial_read_buffer[ML_IO_READ_LINE_START];
+ char *read_buffer = initial_read_buffer;
size_t read_buf_size = ML_IO_READ_LINE_START;
size_t i;
int char_code = '\\0';
Res = 0;
for (i = 0; char_code != '\\n'; ) {
- char_code = mercury_getc(Stream);
+ char_code = mercury_get_byte(Stream);
if (char_code == EOF) {
if (i == 0) {
Res = -1;
@@ -2189,21 +2189,17 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
Res = -2;
break;
}
- if (char_code != (MR_UnsignedChar) char_code) {
- Res = -3;
- break;
- }
read_buffer[i++] = char_code;
MR_assert(i <= read_buf_size);
if (i == read_buf_size) {
/* Grow the read buffer */
read_buf_size = ML_IO_READ_LINE_GROW(read_buf_size);
if (read_buffer == initial_read_buffer) {
- read_buffer = MR_NEW_ARRAY(MR_Char, read_buf_size);
+ read_buffer = MR_NEW_ARRAY(char, read_buf_size);
MR_memcpy(read_buffer, initial_read_buffer,
ML_IO_READ_LINE_START);
} else {
- read_buffer = MR_RESIZE_ARRAY(read_buffer, MR_Char,
+ read_buffer = MR_RESIZE_ARRAY(read_buffer, char,
read_buf_size);
}
}
@@ -2211,10 +2207,10 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
if (Res == 0) {
MR_Word ret_string_word;
MR_offset_incr_hp_atomic_msg(ret_string_word,
- 0, ML_IO_BYTES_TO_WORDS((i + 1) * sizeof(MR_Char)),
+ 0, ML_IO_BYTES_TO_WORDS((i + 1) * sizeof(char)),
MR_PROC_LABEL, ""string.string/0"");
RetString = (MR_String) ret_string_word;
- MR_memcpy(RetString, read_buffer, i * sizeof(MR_Char));
+ MR_memcpy(RetString, read_buffer, i * sizeof(char));
RetString[i] = '\\0';
} else {
/*
@@ -4018,7 +4014,7 @@ have_file_ids :- semidet_fail.
% Buffer sizes are measured in Chars.
:- type buffer.
-:- pragma foreign_type(c, buffer, "MR_Char *", [can_pass_as_mercury_type]).
+:- pragma foreign_type(c, buffer, "char *", [can_pass_as_mercury_type]).
% XXX It would be better to use a char_array (e.g. defined as char[] in
% C#) type rather than array(char). This is because on the Java and IL
@@ -4040,9 +4036,9 @@ have_file_ids :- semidet_fail.
"{
MR_Word buf;
MR_offset_incr_hp_atomic_msg(buf, 0,
- (Size * sizeof(MR_Char) + sizeof(MR_Word) - 1) / sizeof(MR_Word),
+ (Size * sizeof(char) + sizeof(MR_Word) - 1) / sizeof(MR_Word),
MR_PROC_LABEL, ""io:buffer/0"");
- Buffer = (MR_Char *) buf;
+ Buffer = (char *) buf;
}").
io.alloc_buffer(Size, buffer(Array)) :-
@@ -4058,28 +4054,28 @@ io.alloc_buffer(Size, buffer(Array)) :-
[will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
does_not_affect_liveness],
"{
- MR_CHECK_EXPR_TYPE(Buffer0, MR_Char *);
- MR_CHECK_EXPR_TYPE(Buffer, MR_Char *);
+ MR_CHECK_EXPR_TYPE(Buffer0, char *);
+ MR_CHECK_EXPR_TYPE(Buffer, char *);
#ifdef MR_CONSERVATIVE_GC
- Buffer = MR_GC_realloc(Buffer0, NewSize * sizeof(MR_Char));
+ Buffer = MR_GC_realloc(Buffer0, NewSize * sizeof(char));
#else
- if (Buffer0 + OldSize == (MR_Char *) MR_hp) {
+ if (Buffer0 + OldSize == (char *) MR_hp) {
MR_Word next;
MR_offset_incr_hp_atomic_msg(next, 0,
- (NewSize * sizeof(MR_Char) + sizeof(MR_Word) - 1)
+ (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
/ sizeof(MR_Word),
MR_PROC_LABEL, ""io:buffer/0"");
- assert(Buffer0 + OldSize == (MR_Char *) next);
+ assert(Buffer0 + OldSize == (char *) next);
Buffer = Buffer0;
} else {
/* just have to alloc and copy */
MR_Word buf;
MR_offset_incr_hp_atomic_msg(buf, 0,
- (NewSize * sizeof(MR_Char) + sizeof(MR_Word) - 1)
+ (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
/ sizeof(MR_Word),
MR_PROC_LABEL, ""io:buffer/0"");
- Buffer = (MR_Char *) buf;
+ Buffer = (char *) buf;
if (OldSize > NewSize) {
MR_memcpy(Buffer, Buffer0, NewSize);
} else {
@@ -4128,8 +4124,8 @@ io.buffer_to_string(buffer(Array), Len, String) :-
"{
int items_read;
- MR_CHECK_EXPR_TYPE(Buffer0, MR_Char *);
- MR_CHECK_EXPR_TYPE(Buffer, MR_Char *);
+ MR_CHECK_EXPR_TYPE(Buffer0, char *);
+ MR_CHECK_EXPR_TYPE(Buffer, char *);
items_read = MR_READ(*Stream, Buffer0 + Pos0, Size - Pos0);
@@ -5818,6 +5814,18 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
return true;
}
+ private int read_code_unit()
+ throws java.io.IOException
+ {
+ if (!refill_buffer()) {
+ return -1;
+ }
+
+ char c = buf[buf_pos];
+ buf_pos++;
+ return (int) c;
+ }
+
/*
** read_char(): [Java]
**
@@ -5827,18 +5835,22 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
public int read_char()
throws java.io.IOException
{
- if (!refill_buffer()) {
+ final int c1 = read_code_unit();
+ if (c1 == -1) {
return -1;
}
-
- char c = buf[buf_pos];
- buf_pos++;
-
- if (c == '\\n') {
+ if (c1 == '\\n') {
line_number++;
}
-
- return c;
+ if (!Character.isHighSurrogate((char) c1)) {
+ return c1;
+ }
+ final int c2 = read_code_unit();
+ if (c2 != -1 && !Character.isLowSurrogate((char) c2)) {
+ // Return replacement character.
+ return 0xfffd;
+ }
+ return Character.toCodePoint((char) c1, (char) c2);
}
/*
@@ -5911,7 +5923,7 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
}
}
- public void ungetc(char c) {
+ private void unget_code_unit(char c) {
/*
** If necessary, shift the unread characters in the input buffer
** to make room at the front of the buffer. If the buffer is full
@@ -5933,11 +5945,19 @@ int ML_fprintf(MercuryFilePtr mf, const char *format, ...);
}
}
- buf_pos--;
- buf[buf_pos] = c;
+ buf[--buf_pos] = c;
+ }
- if (c == '\\n') {
- line_number--;
+ public void ungetc(int c) {
+ if (Character.charCount(c) == 1) {
+ unget_code_unit((char) c);
+ if (c == '\\n') {
+ line_number--;
+ }
+ } else {
+ char[] units = Character.toChars(c);
+ unget_code_unit(units[1]);
+ unget_code_unit(units[0]);
}
}
@@ -7004,7 +7024,11 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
// So we have to output each character separately.
for (int i = 0; i < s.Length; i++) {
- if (s[i] == '\\n') {
+ if (System.Char.IsSurrogate(s[i])) {
+ // XXX unicode: more error checking
+ mf.writer.Write(s.Substring(i, 2));
+ i++;
+ } else if (s[i] == '\\n') {
mf.line_number++;
mf.writer.WriteLine("""");
} else {
@@ -7017,10 +7041,14 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
").
+:- pragma foreign_decl("C", "
+extern int mercury_get_byte(MercuryFilePtr mf);
+").
+
:- pragma foreign_code("C", "
int
-mercury_getc(MercuryFilePtr mf)
+mercury_get_byte(MercuryFilePtr mf)
{
int c = MR_GETCH(*mf);
if (c == '\\n') {
@@ -7077,6 +7105,9 @@ mercury_getc(MR_MercuryFileStruct mf)
// If not, we still need to treat this as a newline, and thus
// increment the line counter.
mf.line_number++;
+ } else if (System.Char.IsSurrogate((char) c)) {
+ int c2 = mf.reader.Read();
+ c = System.Char.ConvertToUtf32((char) c, (char) c2);
}
} else /* c == NewLine[0] */ {
switch (io.NewLine.Length) {
@@ -7288,6 +7319,16 @@ mercury_close(MercuryFilePtr mf)
:- pragma foreign_code("C#", "
public static void
+mercury_write_codepoint(System.IO.TextWriter w, int c)
+{
+ if (c <= 0xffff) {
+ w.Write((char) c);
+ } else {
+ w.Write(System.Char.ConvertFromUtf32(c));
+ }
+}
+
+public static void
mercury_close(MR_MercuryFileStruct mf)
{
if (mf.reader != null) {
@@ -7336,7 +7377,48 @@ io.read_char_code(input_stream(Stream), CharCode, !IO) :-
[will_not_call_mercury, promise_pure, tabled_for_io,
does_not_affect_liveness, no_sharing],
"
- CharCode = mercury_getc(Stream);
+ char buf[5];
+ int nbytes;
+ int i;
+ int c;
+ unsigned int uc;
+
+ c = mercury_get_byte(Stream);
+ uc = c;
+ if (uc <= 0x7f) {
+ CharCode = uc;
+ } else if (c == EOF) {
+ CharCode = -1;
+ } else {
+ if ((uc & 0xE0) == 0xC0) {
+ nbytes = 2;
+ } else if ((uc & 0xF0) == 0xE0) {
+ nbytes = 3;
+ } else if ((uc & 0xF8) == 0xF0) {
+ nbytes = 4;
+ } else {
+ nbytes = 0;
+ }
+ if (nbytes > 0) {
+ buf[0] = uc;
+ for (i = 1; i < nbytes; i++) {
+ uc = mercury_get_byte(Stream);
+ buf[i] = uc;
+ }
+ buf[i] = '\\0';
+ CharCode = MR_utf8_get(buf, 0);
+ if (CharCode < 0) {
+ /* Invalid byte sequence. */
+ errno = EILSEQ;
+ CharCode = -2;
+ }
+ } else {
+ /* Invalid byte sequence. */
+ errno = EILSEQ;
+ CharCode = -2;
+ }
+ }
+
MR_update_io(IO0, IO);
").
@@ -7350,7 +7432,7 @@ io.read_byte_val(input_stream(Stream), ByteVal, !IO) :-
[will_not_call_mercury, promise_pure, tabled_for_io,
does_not_affect_liveness, no_sharing],
"
- ByteVal = mercury_getc(Stream);
+ ByteVal = mercury_get_byte(Stream);
MR_update_io(IO0, IO);
").
@@ -7367,9 +7449,20 @@ io.putback_char(input_stream(Stream), Character, !IO) :-
if (Character == '\\n') {
MR_line_number(*mf)--;
}
- /* XXX should work even if ungetc() fails */
- if (MR_UNGETCH(*mf, Character) == EOF) {
- mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ if (Character <= 0x7f) {
+ if (MR_UNGETCH(*mf, Character) == EOF) {
+ mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ }
+ } else {
+ char buf[5];
+ ssize_t len;
+ len = MR_utf8_encode(buf, Character);
+ /* XXX this depends on multiple putback */
+ for (; len > 0; len--) {
+ if (MR_UNGETCH(*mf, buf[len - 1]) == EOF) {
+ mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -7384,7 +7477,6 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
does_not_affect_liveness, no_sharing],
"
MercuryFilePtr mf = Stream;
- /* XXX should work even if ungetc() fails */
if (MR_UNGETCH(*mf, Character) == EOF) {
mercury_io_error(mf, ""io.putback_byte: ungetc failed"");
}
@@ -7520,11 +7612,24 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
does_not_affect_liveness, no_sharing],
"
MercuryFilePtr out = mercury_current_text_output();
- if (MR_PUTCH(*out, Character) < 0) {
- mercury_output_error(out);
- }
- if (Character == '\\n') {
- MR_line_number(*out)++;
+ char buf[5];
+ size_t len;
+ int i;
+ if (Character <= 0x7f) {
+ if (MR_PUTCH(*out, Character) < 0) {
+ mercury_output_error(out);
+ }
+ if (Character == '\\n') {
+ MR_line_number(*out)++;
+ }
+ } else {
+ len = MR_utf8_encode(buf, Character);
+ for (i = 0; i < len; i++) {
+ if (MR_PUTCH(*out, buf[i]) < 0) {
+ mercury_output_error(out);
+ break;
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -7626,7 +7731,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
switch (io.mercury_current_text_output.line_ending) {
case io.ML_line_ending_kind.ML_raw_binary:
case io.ML_line_ending_kind.ML_Unix_line_ending:
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
break;
case io.ML_line_ending_kind.ML_OS_line_ending:
w.WriteLine("""");
@@ -7634,7 +7739,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
}
io.mercury_current_text_output.line_number++;
} else {
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
}
").
@@ -7677,8 +7782,11 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
io.write_char(Chr::in, _IO0::di, _IO::uo),
[may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
"
- io.mercury_current_text_output.get().write_or_throw(
- Character.toString(Chr));
+ io.MR_TextOutputFile stream = io.mercury_current_text_output.get();
+ char[] buf = java.lang.Character.toChars(Chr);
+ for (char c : buf) {
+ stream.put_or_throw(c);
+ }
").
:- pragma foreign_proc("Java",
io.write_int(Val::in, _IO0::di, _IO::uo),
@@ -7857,11 +7965,24 @@ io.write_char(output_stream(Stream), Character, !IO) :-
[may_call_mercury, promise_pure, tabled_for_io, thread_safe, terminates,
does_not_affect_liveness, no_sharing],
"
- if (MR_PUTCH(*Stream, Character) < 0) {
- mercury_output_error(Stream);
- }
- if (Character == '\\n') {
- MR_line_number(*Stream)++;
+ if (Character <= 0x7f) {
+ if (MR_PUTCH(*Stream, Character) < 0) {
+ mercury_output_error(Stream);
+ }
+ if (Character == '\\n') {
+ MR_line_number(*Stream)++;
+ }
+ } else {
+ char buf[5];
+ size_t len;
+ int i;
+ len = MR_utf8_encode(buf, Character);
+ for (i = 0; i < len; i++) {
+ if (MR_PUTCH(*Stream, buf[i]) < 0) {
+ mercury_output_error(Stream);
+ break;
+ }
+ }
}
MR_update_io(IO0, IO);
").
@@ -8014,7 +8135,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
switch (stream.line_ending) {
case io.ML_line_ending_kind.ML_raw_binary:
case io.ML_line_ending_kind.ML_Unix_line_ending:
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
break;
case io.ML_line_ending_kind.ML_OS_line_ending:
w.WriteLine("""");
@@ -8022,7 +8143,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
}
stream.line_number++;
} else {
- w.Write(Character);
+ mercury_write_codepoint(w, Character);
}
").
@@ -8089,7 +8210,10 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
io.write_char_2(Stream::in, Character::in, _IO0::di, _IO::uo),
[may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
"
- ((io.MR_TextOutputFile) Stream).put_or_throw(Character);
+ char[] buf = java.lang.Character.toChars(Character);
+ for (char c : buf) {
+ ((io.MR_TextOutputFile) Stream).put_or_throw(c);
+ }
").
:- pragma foreign_proc("Java",
diff --git a/library/lexer.m b/library/lexer.m
index 95fb5cc..ac6b828 100644
--- a/library/lexer.m
+++ b/library/lexer.m
@@ -319,8 +319,7 @@ string_get_context(StartPosn, Context, !Posn) :-
string_read_char(String, Len, Char, Posn0, Posn) :-
Posn0 = posn(LineNum0, LineOffset0, Offset0),
Offset0 < Len,
- string.unsafe_index(String, Offset0, Char),
- Offset = Offset0 + 1,
+ string.unsafe_index_next(String, Offset0, Offset, Char),
( Char = '\n' ->
LineNum = LineNum0 + 1,
Posn = posn(LineNum, Offset, Offset)
@@ -332,13 +331,15 @@ string_read_char(String, Len, Char, Posn0, Posn) :-
string_ungetchar(String, Posn0, Posn) :-
Posn0 = posn(LineNum0, LineOffset0, Offset0),
- Offset = Offset0 - 1,
- string.unsafe_index(String, Offset, Char),
- ( Char = '\n' ->
- LineNum = LineNum0 - 1,
- Posn = posn(LineNum, Offset, Offset)
+ ( string.unsafe_prev_index(String, Offset0, Offset, Char) ->
+ ( Char = '\n' ->
+ LineNum = LineNum0 - 1,
+ Posn = posn(LineNum, Offset, Offset)
+ ;
+ Posn = posn(LineNum0, LineOffset0, Offset)
+ )
;
- Posn = posn(LineNum0, LineOffset0, Offset)
+ Posn = Posn0
).
:- pred grab_string(string::in, posn::in, string::out,
@@ -1223,13 +1224,14 @@ get_unicode_escape(Stream, NumHexChars, QuoteChar, Chars, HexChars, Token,
( if
rev_char_list_to_string(HexChars, HexString),
string.base_string_to_int(16, HexString, UnicodeCharCode),
- convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+ allowed_unicode_char_code(UnicodeCharCode),
+ char.from_int(UnicodeCharCode, UnicodeChar)
then
( if UnicodeCharCode = 0 then
Token = null_character_error
else
get_quoted_name(Stream, QuoteChar,
- list.reverse(UTFChars) ++ Chars, Token, !IO)
+ [UnicodeChar | Chars], Token, !IO)
)
else
Token = error("invalid Unicode character code")
@@ -1263,9 +1265,10 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
( if
rev_char_list_to_string(HexChars, HexString),
string.base_string_to_int(16, HexString, UnicodeCharCode),
- convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+ allowed_unicode_char_code(UnicodeCharCode),
+ char.from_int(UnicodeCharCode, UnicodeChar)
then
- RevCharsWithUnicode = list.reverse(UTFChars) ++ Chars,
+ RevCharsWithUnicode = [UnicodeChar | Chars],
( if UnicodeCharCode = 0 then
string_get_context(Posn0, Context, !Posn),
Token = null_character_error
@@ -1292,90 +1295,6 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
)
).
-:- pred convert_unicode_char_to_target_chars(int::in, list(char)::out)
- is semidet.
-
-convert_unicode_char_to_target_chars(UnicodeCharCode, Chars) :-
- BackendEncoding = backend_unicode_encoding,
- (
- BackendEncoding = utf8,
- encode_unicode_char_as_utf8(UnicodeCharCode, Chars)
- ;
- BackendEncoding = utf16,
- encode_unicode_char_as_utf16(UnicodeCharCode, Chars)
- ).
-
-:- pred encode_unicode_char_as_utf8(int::in, list(char)::out) is semidet.
-
-encode_unicode_char_as_utf8(UnicodeCharCode, UTF8Chars) :-
- allowed_unicode_char_code(UnicodeCharCode),
-
- % Refer to table 3-5 of the Unicode 4.0.0 standard (available from
- % www.unicode.org) for documentation on the bit distribution patterns used
- % below.
-
- ( if UnicodeCharCode =< 0x00007F then
- UTF8Chars = [char.det_from_int(UnicodeCharCode)]
- else if UnicodeCharCode =< 0x0007FF then
- Part1 = (0b11111000000 /\ UnicodeCharCode) >> 6,
- Part2 = 0b00000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11000000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- UTF8Chars = [UTF8Char1, UTF8Char2]
- else if UnicodeCharCode =< 0x00FFFF then
- Part1 = (0b1111000000000000 /\ UnicodeCharCode) >> 12,
- Part2 = (0b0000111111000000 /\ UnicodeCharCode) >> 6,
- Part3 = 0b0000000000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11100000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
- UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3]
- else
- Part1 = (0b111000000000000000000 /\ UnicodeCharCode) >> 18,
- Part2 = (0b000111111000000000000 /\ UnicodeCharCode) >> 12,
- Part3 = (0b000000000111111000000 /\ UnicodeCharCode) >> 6,
- Part4 = 0b000000000000000111111 /\ UnicodeCharCode,
- char.det_from_int(Part1 \/ 0b11110000, UTF8Char1),
- char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
- char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
- char.det_from_int(Part4 \/ 0b10000000, UTF8Char4),
- UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3, UTF8Char4]
- ).
-
-:- pred encode_unicode_char_as_utf16(int::in, list(char)::out) is semidet.
-
- % This predicate should only be called on backends that have
- % a 16 bit character type.
- %
-encode_unicode_char_as_utf16(UnicodeCharCode, UTF16Chars) :-
- allowed_unicode_char_code(UnicodeCharCode),
-
- % If the code point is less than or equal to 0xFFFF
- % then the UTF-16 encoding is simply the code point value,
- % otherwise we construct a surrogate pair.
-
- ( if UnicodeCharCode =< 0xFFFF then
- char.det_from_int(UnicodeCharCode, Char),
- UTF16Chars = [Char]
- else
- %
- % Refer to table 3-4 of the Unicode 4.0.0 standard (available from
- % www.unicode.org) for documentation on the bit distribution patterns
- % used below.
- %
- UUUUU = (0b111110000000000000000 /\ UnicodeCharCode) >> 16,
- XXXXXX = (0b000001111110000000000 /\ UnicodeCharCode) >> 10,
- XXXXXXXXXX = (0b000000000001111111111 /\ UnicodeCharCode),
- WWWWW = UUUUU - 1,
- Surrogate1Lead = 0b1101100000000000,
- Surrogate2Lead = 0b1101110000000000,
- Surrogate1 = Surrogate1Lead \/ (WWWWW << 6) \/ XXXXXX,
- Surrogate2 = Surrogate2Lead \/ XXXXXXXXXX,
- char.det_from_int(Surrogate1, Surrogate1Char),
- char.det_from_int(Surrogate2, Surrogate2Char),
- UTF16Chars = [Surrogate1Char, Surrogate2Char]
- ).
-
:- pred allowed_unicode_char_code(int::in) is semidet.
% Succeeds if the give code point is a legal Unicode code point
@@ -1389,60 +1308,6 @@ allowed_unicode_char_code(Code) :-
Code >= 0xD800, Code =< 0xDFFF
).
-:- type unicode_encoding
- ---> utf8
- ; utf16.
-
-:- func backend_unicode_encoding = unicode_encoding.
-
-backend_unicode_encoding = Encoding :-
- Int = backend_unicode_encoding_int,
- ( unicode_encoding_int_to_encoding(Int, EncodingPrime) ->
- Encoding = EncodingPrime
- ;
- error("backend_unicode_encoding: unexpected Unicode encoding code")
- ).
-
-:- pred unicode_encoding_int_to_encoding(int::in, unicode_encoding::out)
- is semidet.
-
-unicode_encoding_int_to_encoding(0, utf8).
-unicode_encoding_int_to_encoding(1, utf16).
-
-:- func backend_unicode_encoding_int = int.
-
-:- pragma inline(backend_unicode_encoding_int/0).
-
-:- pragma foreign_proc("C",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness],
-"
- EncodingInt = 0;
-").
-
-:- pragma foreign_proc("Java",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
- EncodingInt = 1;
-").
-
-:- pragma foreign_proc("C#",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
- EncodingInt = 1;
-").
-
-:- pragma foreign_proc("Erlang",
- backend_unicode_encoding_int = (EncodingInt::out),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness],
-"
- EncodingInt = 0
-").
-
:- pred get_hex_escape(io.input_stream::in, char::in, list(char)::in,
list(char)::in, token::out, io::di, io::uo) is det.
diff --git a/library/string.m b/library/string.m
index 8ab5454..0534ee4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -20,18 +20,24 @@
%
% The representation of strings is implementation dependent and subject to
% change. In the current implementation, when Mercury is compiled to C, strings
-% are represented as in C, using a null character as the string terminator.
+% are UTF-8 encoded, using a null character as the string terminator.
+% A single code point requires one to four bytes (code units) to encode.
+%
% When Mercury is compiled to Java, strings are represented as Java `String's.
-% When Mercury is compiled to .NET IL code, strings are represented as .NET
-% `System.String's.
+% When Mercury is compiled to C# code, strings are represented as
+% `System.String's. In both cases, strings are UTF-16 encoded. A single code
+% point requires one or two 16-bit integers (code units) to encode.
+%
+% Whe Mercury is compiled to Erlang, strings are represented as Erlang
+% binaries using UTF-8 encoding.
+% XXX However, this is incomplete.
%
% The builtin comparison operation on strings is also implementation dependent.
% In the current implementation, when Mercury is compiled to C, string
% comparison is implemented using C's strcmp() function. When Mercury
% is compiled to Java, string comparison is implemented using Java's
-% String.compareTo() method. When Mercury is compiled to .NET IL code
-% string comparison is implemented using C#'s System.String.CompareOrdinal()
-% method.
+% String.compareTo() method. When Mercury is compiled to C#, string comparison
+% is implemented using C#'s System.String.CompareOrdinal() method.
%
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
@@ -69,14 +75,43 @@
:- type text_file
---> text_file(string).
- % Determine the length of a string.
+ % Determine the length of a string, in code units.
% An empty string has length zero.
%
+ % NOTE: code points (characters) are encoded using one or more code units,
+ % i.e. bytes for UTF-8; 16-bit integers for UTF-16.
+ %
:- func string.length(string::in) = (int::uo) is det.
:- pred string.length(string, int).
:- mode string.length(in, uo) is det.
:- mode string.length(ui, uo) is det.
+ % Synonyms for string.length.
+ %
+:- func string.count_code_units(string) = int.
+:- pred string.count_code_units(string::in, int::out) is det.
+
+ % Determine the number of code points in a string.
+ %
+:- func string.count_codepoints(string) = int.
+:- pred string.count_codepoints(string::in, int::out) is det.
+
+ % string.codepoint_offset(String, CodePointCount, CodePointOffset):
+ % Equivalent to `string.codepoint_offset(String, 0, CodePointCount,
+ % CodePointOffset)'.
+ %
+:- pred string.codepoint_offset(string::in, int::in, int::out) is semidet.
+
+ % string.codepoint_offset(String, StartOffset, CodePointCount,
+ % CodePointOffset):
+ %
+ % Return the offset into `String' where, starting from `StartOffset',
+ % `CodePointCount' code points are skipped. Fails if either `StartOffset'
+ % or `CodePointOffset' are out of range.
+ %
+:- pred string.codepoint_offset(string::in, int::in, int::in, int::out)
+ is semidet.
+
% Append two strings together.
%
:- func string.append(string::in, string::in) = (string::uo) is det.
@@ -170,7 +205,7 @@
:- mode string.string_ops_noncanon(in, in, in, out) is cc_multi.
% string.char_to_string(Char, String).
- % Converts a character (single-character atom) to a string or vice versa.
+ % Converts a character (code point) to a string or vice versa.
%
:- func string.char_to_string(char::in) = (string::uo) is det.
:- pred string.char_to_string(char, string).
@@ -238,7 +273,7 @@
:- func string.from_c_pointer(c_pointer::in) = (string::uo) is det.
% string.first_char(String, Char, Rest) is true iff Char is the first
- % character of String, and Rest is the remainder.
+ % character (code point) of String, and Rest is the remainder.
%
% WARNING: string.first_char makes a copy of Rest because the garbage
% collector doesn't handle references into the middle of an object,
@@ -298,16 +333,18 @@
:- func string.uncapitalize_first(string) = string.
:- pred string.uncapitalize_first(string::in, string::out) is det.
- % Convert the string to a list of characters.
- % Throws an exception if the list of characters contains a null character.
+ % Convert the string to a list of characters (code points).
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.to_char_list(string) = list(char).
:- pred string.to_char_list(string, list(char)).
:- mode string.to_char_list(in, out) is det.
:- mode string.to_char_list(uo, in) is det.
- % Convert a list of characters to a string.
- % Throws an exception if the list of characters contains a null character.
+ % Convert a list of characters (code points) to a string.
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.from_char_list(list(char)::in) = (string::uo) is det.
:- pred string.from_char_list(list(char), string).
@@ -315,23 +352,34 @@
:- mode string.from_char_list(out, in) is det.
% As above, but fail instead of throwing an exception if the
- % list contains a null character.
+ % list contains a null character or surrogate code point.
%
:- pred string.semidet_from_char_list(list(char)::in, string::uo) is semidet.
% Same as string.from_char_list, except that it reverses the order
% of the characters.
- % Throws an exception if the list of characters contains a null character.
+ % Throws an exception if the list of characters contains a null character
+ % or surrogate code point.
%
:- func string.from_rev_char_list(list(char)::in) = (string::uo) is det.
:- pred string.from_rev_char_list(list(char)::in, string::uo) is det.
% As above, but fail instead of throwing an exception if the
- % list contains a null character.
+ % list contains a null character or surrogate code point.
%
:- pred string.semidet_from_rev_char_list(list(char)::in, string::uo)
is semidet.
+ % Convert a string into a list of code units.
+ %
+:- pred string.to_code_unit_list(string::in, list(int)::out) is det.
+
+ % Convert a list of code units to a string.
+ % Fails if the list does not contain a valid encoding of a string,
+ % in the encoding expected by the current process.
+ %
+:- pred string.from_code_unit_list(list(int)::in, string::uo) is semidet.
+
% Converts a signed base 10 string to an int; throws an exception
% if the string argument does not match the regexp [+-]?[0-9]+
% or the number is not in the range [int.min_int+1, int.max_int].
@@ -371,15 +419,17 @@
%
:- pred string.to_float(string::in, float::out) is semidet.
- % True if string contains only alphabetic characters (letters).
+ % True if string contains only alphabetic characters [A-Za-z].
%
:- pred string.is_all_alpha(string::in) is semidet.
- % True if string contains only alphabetic characters and underscores.
+ % True if string contains only alphabetic characters [A-Za-z] and
+ % underscores.
%
:- pred string.is_all_alpha_or_underscore(string::in) is semidet.
- % True if string contains only letters, digits, and underscores.
+ % True if string contains only alphabetic characters [A-Za-z], digits,
+ % and underscores.
%
:- pred string.is_all_alnum_or_underscore(string::in) is semidet.
@@ -389,7 +439,7 @@
% string.all_match(TestPred, String):
%
- % True if TestPred is true when applied to each character in
+ % True if TestPred is true when applied to each character (code point) in
% String or if String is the empty string.
%
:- pred string.all_match(pred(char)::in(pred(in) is semidet), string::in)
@@ -398,6 +448,8 @@
% string.pad_left(String0, PadChar, Width, String):
% Insert `PadChar's at the left of `String0' until it is at least as long
% as `Width', giving `String'.
+ % Width is measured as the number of code points, which is too simplistic
+ % if the string may contain zero or double width code points.
%
:- func string.pad_left(string, char, int) = string.
:- pred string.pad_left(string::in, char::in, int::in, string::out) is det.
@@ -405,33 +457,39 @@
% string.pad_right(String0, PadChar, Width, String):
% Insert `PadChar's at the right of `String0' until it is at least as long
% as `Width', giving `String'.
+ % Width is measured as the number of code points, which is too simplistic
+ % if the string may contain zero or double width code points.
%
:- func string.pad_right(string, char, int) = string.
:- pred string.pad_right(string::in, char::in, int::in, string::out) is det.
% string.duplicate_char(Char, Count, String):
% Construct a string consisting of `Count' occurrences of `Char'
- % in sequence.
+ % code points in sequence.
%
:- func string.duplicate_char(char::in, int::in) = (string::uo) is det.
:- pred string.duplicate_char(char::in, int::in, string::uo) is det.
% string.contains_char(String, Char):
- % Succeed if `Char' occurs in `String'.
+ % Succeed if the code point `Char' occurs in `String'.
%
:- pred string.contains_char(string::in, char::in) is semidet.
% string.index(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
- % Fails if `Index' is out of range (negative, or greater than or equal to
- % the length of `String').
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'. Fails if `Index' is out of range (negative, or
+ % greater than or equal to the length of `String').
+ %
+ % Calls error/1 if an illegal sequence is detected.
%
:- pred string.index(string::in, int::in, char::uo) is semidet.
% string.index_det(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'.
% Calls error/1 if `Index' is out of range (negative, or greater than
- % or equal to the length of `String').
+ % or equal to the length of `String'), or if an illegal sequence is
+ % detected.
%
:- func string.index_det(string, int) = char.
:- pred string.index_det(string::in, int::in, char::uo) is det.
@@ -442,7 +500,8 @@
:- func string ^ elem(int) = char.
% string.unsafe_index(String, Index, Char):
- % `Char' is the (`Index' + 1)-th character of `String'.
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'.
% WARNING: behavior is UNDEFINED if `Index' is out of range
% (negative, or greater than or equal to the length of `String').
% This version is constant time, whereas string.index_det
@@ -456,56 +515,92 @@
%
:- func string ^ unsafe_elem(int) = char.
+ % string.index_next(String, Index, NextIndex, Char):
+ % Like `string.index'/3 but also returns the position of the code unit
+ % that follows the code point beginning at `Index',
+ % i.e. NextIndex = Index + num_bytes_to_encode(Char).
+ %
+:- pred string.index_next(string::in, int::in, int::out, char::uo) is semidet.
+
+ % string.unsafe_index_next(String, Index, NextIndex, Char):
+ % `Char' is the character (code point) in `String', beginning at the
+ % code unit `Index'. `NextIndex' is the offset following the encoding
+ % of `Char'. Fails if `Index' is equal to the length of `String'.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than the length of `String').
+ %
+:- pred string.unsafe_index_next(string::in, int::in, int::out, char::uo)
+ is semidet.
+
+ % string.unsafe_prev_index(String, Index, CharIndex, Char):
+ % `Char' is the character (code point) in `String' immediately _before_
+ % the code unit `Index'. `CharIndex' is the offset of the beginning of
+ % `Char'. Fails if `Index' is zero.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than or equal to the length of `String').
+ %
+:- pred string.unsafe_prev_index(string::in, int::in, int::out, char::uo)
+ is semidet.
+
+ % string.unsafe_index_code_unit(String, Index, CodeList):
+ % `Code' unit is the code unit in `String' at the offset `Index'.
+ % WARNING: behavior is UNDEFINED if `Index' is out of range
+ % (negative, or greater than or equal to the length of `String').
+ %
+:- pred string.unsafe_index_code_unit(string::in, int::in, int::out) is det.
+
% string.chomp(String):
% `String' minus any single trailing newline character.
%
:- func string.chomp(string) = string.
% string.lstrip(String):
- % `String' minus any initial whitespace characters.
+ % `String' minus any initial whitespace characters in the ASCII range.
%
:- func string.lstrip(string) = string.
% string.rstrip(String):
- % `String' minus any trailing whitespace characters.
+ % `String' minus any trailing whitespace characters in the ASCII range.
%
:- func string.rstrip(string) = string.
% string.strip(String):
- % `String' minus any initial and trailing whitespace characters.
+ % `String' minus any initial and trailing whitespace characters in the
+ % ASCII range.
%
:- func string.strip(string) = string.
% string.lstrip_pred(Pred, String):
- % `String' minus the maximal prefix consisting entirely of chars
- % satisfying `Pred'.
+ % `String' minus the maximal prefix consisting entirely of characters
+ % (code points) satisfying `Pred'.
%
:- func string.lstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
= (string::out) is det.
% string.rstrip_pred(Pred, String):
- % `String' minus the maximal suffix consisting entirely of chars
- % satisfying `Pred'.
+ % `String' minus the maximal suffix consisting entirely of characters
+ % (code points) satisfying `Pred'.
%
:- func string.rstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
= (string::out) is det.
% string.prefix_length(Pred, String):
- % The length of the maximal prefix of `String' consisting entirely of
- % chars satisfying Pred.
+ % The length (in code units) of the maximal prefix of `String' consisting
+ % entirely of characters (code points) satisfying Pred.
%
:- func string.prefix_length(pred(char)::in(pred(in) is semidet), string::in)
= (int::out) is det.
% string.suffix_length(Pred, String):
- % The length of the maximal suffix of `String' consisting entirely of chars
- % satisfying Pred.
+ % The length (in code units) of the maximal suffix of `String' consisting
+ % entirely of characters (code points) satisfying Pred.
%
:- func suffix_length(pred(char)::in(pred(in) is semidet), string::in)
= (int::out) is det.
% string.set_char(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% Fails if `Index' is out of range (negative, or greater than or equal to
% the length of `String0').
%
@@ -516,7 +611,8 @@
%:- mode string.set_char(in, in, di, uo) is semidet.
% string.set_char_det(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% Calls error/1 if `Index' is out of range (negative, or greater than
% or equal to the length of `String0').
%
@@ -528,7 +624,8 @@
%:- mode string.set_char_det(in, in, di, uo) is det.
% string.unsafe_set_char(Char, Index, String0, String):
- % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+ % `String' is `String0', with the code point beginning at code unit
+ % `Index' removed and replaced by `Char'.
% WARNING: behavior is UNDEFINED if `Index' is out of range
% (negative, or greater than or equal to the length of `String0').
% This version is constant time, whereas string.set_char_det
@@ -547,8 +644,8 @@
% string.foldl(Closure, String, !Acc):
% `Closure' is an accumulator predicate which is to be called for each
- % character of the string `String' in turn. The initial value of the
- % accumulator is `!.Acc' and the final value is `!:Acc'.
+ % character (code point) of the string `String' in turn. The initial
+ % value of the accumulator is `!.Acc' and the final value is `!:Acc'.
% (string.foldl is equivalent to
% string.to_char_list(String, Chars),
% list.foldl(Closure, Chars, !Acc)
@@ -583,6 +680,8 @@
% is equivalent to string.foldl(Closure, SubString, !Acc)
% where SubString = string.substring(String, Start, Count).
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- func string.foldl_substring(func(char, A) = A, string, int, int, A) = A.
:- pred string.foldl_substring(pred(char, A, A), string, int, int, A, A).
:- mode string.foldl_substring(pred(in, in, out) is det, in, in, in,
@@ -599,6 +698,8 @@
% string.foldl_substring2(Closure, String, Start, Count, !Acc1, !Acc2)
% A variant of string.foldl_substring with two accumulators.
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- pred string.foldl2_substring(pred(char, A, A, B, B),
string, int, int, A, A, B, B).
:- mode string.foldl2_substring(pred(in, di, uo, di, uo) is det,
@@ -629,6 +730,8 @@
% is equivalent to string.foldr(Closure, SubString, !Acc)
% where SubString = string.substring(String, Start, Count).
%
+ % `Start' and `Count' are in terms of code units.
+ %
:- func string.foldr_substring(func(char, T) = T, string, int, int, T) = T.
:- pred string.foldr_substring(pred(char, T, T), string, int, int, T, T).
:- mode string.foldr_substring(pred(in, in, out) is det, in, in, in,
@@ -644,7 +747,8 @@
% string.words_separator(SepP, String) returns the list of non-empty
% substrings of String (in first to last order) that are delimited
- % by non-empty sequences of chars matched by SepP. For example,
+ % by non-empty sequences of characters (code points) matched by SepP.
+ % For example,
%
% string.words_separator(char.is_whitespace, " the cat sat on the mat") =
% ["the", "cat", "sat", "on", "the", "mat"]
@@ -661,7 +765,7 @@
% string.split_at_separator(SepP, String) returns the list of
% substrings of String (in first to last order) that are delimited
- % by chars matched by SepP. For example,
+ % by characters (code points) matched by SepP. For example,
%
% string.split_at_separator(char.is_whitespace, " a cat sat on the mat")
% = ["", "a", "cat", "", "sat", "on", "the", "", "mat"]
@@ -686,33 +790,59 @@
%
:- func string.split_at_string(string, string) = list(string).
- % string.split(String, Count, LeftSubstring, RightSubstring):
- % `LeftSubstring' is the left-most `Count' characters of `String',
- % and `RightSubstring' is the remainder of `String'.
+ % string.split(String, Index, LeftSubstring, RightSubstring):
+ % Split a string into two substrings, at the code unit `Index'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- pred string.split(string::in, int::in, string::uo, string::uo) is det.
+ % string.split_by_codepoint(String, Count, LeftSubstring, RightSubstring):
+ % `LeftSubstring' is the left-most `Count' characters (code points) of
+ % `String', and `RightSubstring' is the remainder of `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- pred string.split_by_codepoint(string::in, int::in, string::uo, string::uo)
+ is det.
+
% string.left(String, Count, LeftSubstring):
- % `LeftSubstring' is the left-most `Count' characters of `String'.
+ % `LeftSubstring' is the left-most `Count' code _units_ of `String'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- func string.left(string::in, int::in) = (string::uo) is det.
:- pred string.left(string::in, int::in, string::uo) is det.
+ % string.left_by_codepoint(String, Count, LeftSubstring):
+ % `LeftSubstring' is the left-most `Count' characters (code points) of
+ % `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- func string.left_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.left_by_codepoint(string::in, int::in, string::uo) is det.
+
% string.right(String, Count, RightSubstring):
- % `RightSubstring' is the right-most `Count' characters of `String'.
+ % `RightSubstring' is the right-most `Count' code _units_ of `String'.
% (If `Count' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.)
%
:- func string.right(string::in, int::in) = (string::uo) is det.
:- pred string.right(string::in, int::in, string::uo) is det.
+ % string.right_by_codepoint(String, Count, RightSubstring):
+ % `RightSubstring' is the right-most `Count' characters (code points) of
+ % `String'.
+ % (If `Count' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.)
+ %
+:- func string.right_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.right_by_codepoint(string::in, int::in, string::uo) is det.
+
% string.substring(String, Start, Count, Substring):
- % `Substring' is first the `Count' characters in what would remain
- % of `String' after the first `Start' characters were removed.
+ % `Substring' is first the `Count' code _units_ in what would remain
+ % of `String' after the first `Start' code _units_ were removed.
% (If `Start' is out of the range [0, length of `String'], it is treated
% as if it were the nearest end-point of that range.
% If `Count' is out of the range [0, length of `String' - `Start'],
@@ -721,15 +851,28 @@
:- func string.substring(string::in, int::in, int::in) = (string::uo) is det.
:- pred string.substring(string::in, int::in, int::in, string::uo) is det.
+ % string.substring_by_codepoint(String, Start, Count, Substring):
+ % `Substring' is first the `Count' code points in what would remain
+ % of `String' after the first `Start' code points were removed.
+ % (If `Start' is out of the range [0, length of `String'], it is treated
+ % as if it were the nearest end-point of that range.
+ % If `Count' is out of the range [0, length of `String' - `Start'],
+ % it is treated as if it were the nearest end-point of that range.)
+ %
+:- func string.substring_by_codepoint(string::in, int::in, int::in)
+ = (string::uo) is det.
+:- pred string.substring_by_codepoint(string::in, int::in, int::in, string::uo)
+ is det.
+
% string.unsafe_substring(String, Start, Count, Substring):
- % `Substring' is first the `Count' characters in what would remain
- % of `String' after the first `Start' characters were removed.
+ % `Substring' is first the `Count' code _units_ in what would remain
+ % of `String' after the first `Start' code _units_ were removed.
% WARNING: if `Start' is out of the range [0, length of `String'],
% or if `Count' is out of the range [0, length of `String' - `Start'],
% then the behaviour is UNDEFINED. Use with care!
% This version takes time proportional to the length of the substring,
% whereas string.substring may take time proportional to the length
- %% of the whole string.
+ % of the whole string.
%
:- func string.unsafe_substring(string::in, int::in, int::in) = (string::uo)
is det.
@@ -757,16 +900,16 @@
:- func string.hash3(string) = int.
% string.sub_string_search(String, SubString, Index).
- % `Index' is the position in `String' where the first occurrence of
- % `SubString' begins. Indices start at zero, so if `SubString' is a prefix
- % of `String', this will return Index = 0.
+ % `Index' is the code unit position in `String' where the first
+ % occurrence of `SubString' begins. Indices start at zero, so if
+ % `SubString' is a prefix of `String', this will return Index = 0.
%
:- pred string.sub_string_search(string::in, string::in, int::out) is semidet.
% string.sub_string_search_start(String, SubString, BeginAt, Index).
- % `Index' is the position in `String' where the first occurrence of
- % `SubString' occurs such that 'Index' is greater than or equal to
- % `BeginAt'. Indices start at zero,
+ % `Index' is the code unit position in `String' where the first
+ % occurrence of `SubString' occurs such that 'Index' is greater than or
+ % equal to `BeginAt'. Indices start at zero.
%
:- pred string.sub_string_search_start(string::in, string::in, int::in,
int::out)
@@ -848,6 +991,12 @@
% bb * 22
% ccc * 333
%
+ % NOTE: the result is only aligned in termed of numbers of code points.
+ % The output may not be properly aligned if any code points occupy more or
+ % less than one column when printed.
+ %
+ % (In the future, we could try to use wcswidth.)
+ %
:- func string.format_table(list(justified_column), string) = string.
% format_table_max(Columns, Separator) does the same job as format_table,
@@ -861,11 +1010,14 @@
; right(list(string)).
% word_wrap(Str, N) = Wrapped.
- % Wrapped is Str with newlines inserted between words so that at most
- % N characters appear on a line and each line contains as many whole words
- % as possible. If any one word exceeds N characters in length then it will
- % be broken over two (or more) lines. Sequences of whitespace characters
- % are replaced by a single space.
+ % Wrapped is Str with newlines inserted between words (separated by ASCII
+ % space characters) so that at most N code points appear on a line and each
+ % line contains as many whole words as possible. If any one word exceeds N
+ % code point in length then it will be broken over two (or more) lines.
+ % Sequences of whitespace characters are replaced by a single space.
+ %
+ % NOTE: this function does not care what a code point is, nor how wide are
+ % the glyphs for each code point.
%
:- func string.word_wrap(string, int) = string.
@@ -873,7 +1025,7 @@
% word_wrap_separator/3 is like word_wrap/2, except that words that
% need to be broken up over multiple lines have WordSeparator inserted
% between each piece. If the length of WordSeparator is greater than
- % or equal to N, then no separator is used.
+ % or equal to N code points, then no separator is used.
%
:- func string.word_wrap_separator(string, int, string) = string.
@@ -1008,12 +1160,14 @@ string.foldl2(Closure, String, !Acc1, !Acc2) :-
string.foldl_substring(Closure, String, Start0, Count0, !Acc) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldl_substring_2(Closure, String, Start, Count, !Acc).
+ End = Start + Count,
+ string.foldl_substring_2(Closure, String, Start, End, !Acc).
string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldl2_substring_2(Closure, String, Start, Count, !Acc1, !Acc2).
+ End = Start + Count,
+ string.foldl2_substring_2(Closure, String, Start, End, !Acc1, !Acc2).
:- pred string.foldl_substring_2(pred(char, A, A), string, int, int, A, A).
:- mode string.foldl_substring_2(pred(in, di, uo) is det, in, in, in,
@@ -1027,10 +1181,14 @@ string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
:- mode string.foldl_substring_2(pred(in, in, out) is multi, in, in, in,
in, out) is multi.
-string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I), !Acc),
- string.foldl_substring_2(Closure, String, I + 1, Count - 1, !Acc)
+string.foldl_substring_2(Closure, String, I, End, !Acc) :-
+ (
+ I < End,
+ string.unsafe_index_next(String, I, J, Char),
+ J =< End
+ ->
+ Closure(Char, !Acc),
+ string.foldl_substring_2(Closure, String, J, End, !Acc)
;
true
).
@@ -1050,11 +1208,14 @@ string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
:- mode string.foldl2_substring_2(pred(in, in, out, in, out) is multi,
in, in, in, in, out, in, out) is multi.
-string.foldl2_substring_2(Closure, String, I, Count, !Acc1, !Acc2) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I), !Acc1, !Acc2),
- string.foldl2_substring_2(Closure, String, I + 1, Count - 1,
- !Acc1, !Acc2)
+string.foldl2_substring_2(Closure, String, I, End, !Acc1, !Acc2) :-
+ (
+ I < End,
+ string.unsafe_index_next(String, I, J, Char),
+ J =< End
+ ->
+ Closure(Char, !Acc1, !Acc2),
+ string.foldl2_substring_2(Closure, String, J, End, !Acc1, !Acc2)
;
true
).
@@ -1073,7 +1234,8 @@ string.foldr(Closure, String, Acc0, Acc) :-
string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
Start = max(0, Start0),
Count = min(Count0, length(String) - Start),
- string.foldr_substring_2(Closure, String, Start, Count, Acc0, Acc).
+ End = Start + Count,
+ string.foldr_substring_2(Closure, String, Start, End, Acc0, Acc).
:- pred string.foldr_substring_2(pred(char, T, T), string, int, int, T, T).
:- mode string.foldr_substring_2(pred(in, in, out) is det, in, in, in,
@@ -1087,10 +1249,14 @@ string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
:- mode string.foldr_substring_2(pred(in, in, out) is multi, in, in, in,
in, out) is multi.
-string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
- ( 0 < Count ->
- Closure(string.unsafe_index(String, I + Count - 1), !Acc),
- string.foldr_substring_2(Closure, String, I, Count - 1, !Acc)
+string.foldr_substring_2(Closure, String, Start, I, !Acc) :-
+ (
+ I > Start,
+ string.unsafe_prev_index(String, I, J, Char),
+ J >= Start
+ ->
+ Closure(Char, !Acc),
+ string.foldr_substring_2(Closure, String, Start, J, !Acc)
;
true
).
@@ -1098,11 +1264,25 @@ string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
string.left(String, Count, LeftString) :-
string.split(String, Count, LeftString, _RightString).
+string.left_by_codepoint(String, Count) = LeftString :-
+ string.left_by_codepoint(String, Count, LeftString).
+
+string.left_by_codepoint(String, Count, LeftString) :-
+ string.split_by_codepoint(String, Count, LeftString, _RightString).
+
string.right(String, RightCount, RightString) :-
string.length(String, Length),
LeftCount = Length - RightCount,
string.split(String, LeftCount, _LeftString, RightString).
+string.right_by_codepoint(String, RightCount) = RightString :-
+ string.right_by_codepoint(String, RightCount, RightString).
+
+string.right_by_codepoint(String, RightCount, RightString) :-
+ string.count_codepoints(String, TotalCount),
+ LeftCount = TotalCount - RightCount,
+ string.split_by_codepoint(String, LeftCount, _LeftString, RightString).
+
string.remove_suffix(String, Suffix, StringWithoutSuffix) :-
string.suffix(String, Suffix),
string.left(String, length(String) - length(Suffix), StringWithoutSuffix).
@@ -1141,27 +1321,28 @@ string.prefix(String::in, Prefix::in) :-
PreLen =< Len,
prefix_2_iii(String, Prefix, PreLen - 1).
string.prefix(String::in, Prefix::out) :-
- Len = length(String),
- prefix_2_ioii(String, Prefix, 0, Len).
+ prefix_2_ioi(String, Prefix, 0).
:- pred prefix_2_iii(string::in, string::in, int::in) is semidet.
prefix_2_iii(String, Prefix, I) :-
( 0 =< I ->
- (String `unsafe_index` I) = (Prefix `unsafe_index` I) `with_type` char,
+ string.unsafe_index_code_unit(String, I, C),
+ string.unsafe_index_code_unit(Prefix, I, C),
prefix_2_iii(String, Prefix, I - 1)
;
true
).
-:- pred prefix_2_ioii(string::in, string::out, int::in, int::in) is multi.
-
-prefix_2_ioii(String, Prefix, PreLen, _Len) :-
- Prefix = unsafe_substring(String, 0, PreLen).
+:- pred prefix_2_ioi(string::in, string::out, int::in) is multi.
-prefix_2_ioii(String, Prefix, PreLen, Len) :-
- PreLen < Len,
- prefix_2_ioii(String, Prefix, PreLen + 1, Len).
+prefix_2_ioi(String, Prefix, Cur) :-
+ (
+ Prefix = unsafe_substring(String, 0, Cur)
+ ;
+ string.unsafe_index_next(String, Cur, Next, _),
+ prefix_2_ioi(String, Prefix, Next)
+ ).
:- pragma promise_equivalent_clauses(string.suffix/2).
@@ -1172,15 +1353,15 @@ string.suffix(String::in, Suffix::in) :-
suffix_2_iiii(String, Suffix, 0, Len - PreLen, PreLen).
string.suffix(String::in, Suffix::out) :-
Len = length(String),
- suffix_2_ioii(String, Suffix, 0, Len).
+ suffix_2_ioii(String, Suffix, Len, Len).
:- pred suffix_2_iiii(string::in, string::in, int::in, int::in, int::in)
is semidet.
suffix_2_iiii(String, Suffix, I, Offset, Len) :-
( I < Len ->
- (String `unsafe_index` (I + Offset)) =
- (Suffix `unsafe_index` I) `with_type` char,
+ string.unsafe_index_code_unit(String, I + Offset, C),
+ string.unsafe_index_code_unit(Suffix, I, C),
suffix_2_iiii(String, Suffix, I + 1, Offset, Len)
;
true
@@ -1188,12 +1369,13 @@ suffix_2_iiii(String, Suffix, I, Offset, Len) :-
:- pred suffix_2_ioii(string::in, string::out, int::in, int::in) is multi.
-suffix_2_ioii(String, Suffix, SufLen, Len) :-
- Suffix = unsafe_substring(String, Len - SufLen, SufLen).
-
-suffix_2_ioii(String, Suffix, SufLen, Len) :-
- SufLen < Len,
- suffix_2_ioii(String, Suffix, SufLen + 1, Len).
+suffix_2_ioii(String, Suffix, Cur, Len) :-
+ (
+ string.unsafe_substring(String, Cur, Len, Suffix)
+ ;
+ string.unsafe_prev_index(String, Cur, Prev, _),
+ suffix_2_ioii(String, Suffix, Prev, Len)
+ ).
string.char_to_string(Char, String) :-
string.to_char_list(String, [Char]).
@@ -1364,11 +1546,16 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- MR_ConstString p = Str + strlen(Str);
+ int pos = strlen(Str);
+ int c;
+
CharList = MR_list_empty_msg(MR_PROC_LABEL);
- while (p > Str) {
- p--;
- CharList = MR_char_list_cons_msg((MR_UnsignedChar) *p, CharList,
+ for (;;) {
+ c = MR_utf8_prev_get(Str, &pos);
+ if (c <= 0) {
+ break;
+ }
+ CharList = MR_char_list_cons_msg((MR_UnsignedChar) c, CharList,
MR_PROC_LABEL);
}
}").
@@ -1380,7 +1567,23 @@ string.to_char_list(Str::uo, CharList::in) :-
"
list.List_1 lst = list.empty_list();
for (int i = Str.Length - 1; i >= 0; i--) {
- char c = Str[i];
+ int c;
+ char c2 = Str[i];
+ if (System.Char.IsLowSurrogate(c2)) {
+ try {
+ char c1 = Str[i - 1];
+ c = System.Char.ConvertToUtf32(c1, c2);
+ i--;
+ } catch (System.ArgumentOutOfRangeException) {
+ c = 0xfffd;
+ } catch (System.IndexOutOfRangeException) {
+ c = 0xfffd;
+ }
+ } else if (System.Char.IsHighSurrogate(c2)) {
+ c = 0xfffd;
+ } else {
+ c = c2;
+ }
lst = list.cons(c, lst);
}
CharList = lst;
@@ -1391,10 +1594,11 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- list.List_1<Character> lst = list.empty_list();
- for (int i = Str.length() - 1; i >= 0; i--) {
- char c = Str.charAt(i);
+ list.List_1<Integer> lst = list.empty_list();
+ for (int i = Str.length(); i > 0; ) {
+ int c = Str.codePointBefore(i);
lst = list.cons(c, lst);
+ i -= java.lang.Character.charCount(c);
}
CharList = lst;
").
@@ -1404,7 +1608,7 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
- CharList = binary_to_list(Str)
+ CharList = unicode:characters_to_list(Str)
").
string.to_char_list_2(Str, CharList) :-
@@ -1451,7 +1655,7 @@ string.from_char_list(Chars::in, Str::uo) :-
size = 0;
char_list_ptr = CharList;
while (! MR_list_is_empty(char_list_ptr)) {
- size++;
+ size += MR_utf8_width(MR_list_head(char_list_ptr));
char_list_ptr = MR_list_tail(char_list_ptr);
}
@@ -1467,8 +1671,8 @@ string.from_char_list(Chars::in, Str::uo) :-
size = 0;
char_list_ptr = CharList;
while (! MR_list_is_empty(char_list_ptr)) {
- MR_Char c;
- c = (MR_Char) MR_list_head(char_list_ptr);
+ int c;
+ c = MR_list_head(char_list_ptr);
/*
** It is an error to put a null character in a string
** (see the comments at the top of this file).
@@ -1477,7 +1681,7 @@ string.from_char_list(Chars::in, Str::uo) :-
SUCCESS_INDICATOR = MR_FALSE;
break;
}
- Str[size++] = c;
+ size += MR_utf8_encode(Str + size, c);
char_list_ptr = MR_list_tail(char_list_ptr);
}
@@ -1491,7 +1695,12 @@ string.from_char_list(Chars::in, Str::uo) :-
"
System.Text.StringBuilder sb = new System.Text.StringBuilder();
while (!list.is_empty(CharList)) {
- sb.Append((char) list.det_head(CharList));
+ int cp = (int) list.det_head(CharList);
+ if (cp <= 0xffff) {
+ sb.Append((char) cp);
+ } else {
+ sb.Append(System.Char.ConvertFromUtf32(cp));
+ }
CharList = list.det_tail(CharList);
}
Str = sb.ToString();
@@ -1504,9 +1713,14 @@ string.from_char_list(Chars::in, Str::uo) :-
does_not_affect_liveness],
"
java.lang.StringBuilder sb = new StringBuilder();
- Iterable<Character> iterable = new list.ListIterator<Character>(CharList);
- for (char c : iterable) {
- sb.append(c);
+ Iterable<Integer> iterable = new list.ListIterator<Integer>(CharList);
+ for (int c : iterable) {
+ if (c <= 0xffff) {
+ /* Fast path. */
+ sb.append((char) c);
+ } else {
+ sb.append(java.lang.Character.toChars(c));
+ }
}
Str = sb.toString();
SUCCESS_INDICATOR = true;
@@ -1517,8 +1731,8 @@ string.from_char_list(Chars::in, Str::uo) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
- SUCCESS_INDICATOR = true,
- Str = list_to_binary(CharList)
+ Str = unicode:characters_to_binary(CharList),
+ SUCCESS_INDICATOR = true
").
:- pragma promise_equivalent_clauses(string.semidet_from_char_list/2).
@@ -1561,7 +1775,7 @@ string.from_rev_char_list(Chars, Str) :-
size = 0;
list_ptr = Chars;
while (!MR_list_is_empty(list_ptr)) {
- size++;
+ size += MR_utf8_width(MR_list_head(list_ptr));
list_ptr = MR_list_tail(list_ptr);
}
@@ -1589,7 +1803,8 @@ string.from_rev_char_list(Chars, Str) :-
SUCCESS_INDICATOR = MR_FALSE;
break;
}
- Str[--size] = c;
+ size -= MR_utf8_width(c);
+ MR_utf8_encode(Str + size, c);
list_ptr = MR_list_tail(list_ptr);
}
}").
@@ -1602,14 +1817,26 @@ string.from_rev_char_list(Chars, Str) :-
int size = 0;
list.List_1 list_ptr = Chars;
while (!list.is_empty(list_ptr)) {
- size++;
+ int c = (int) list.det_head(list_ptr);
+ if (c <= 0xffff) {
+ size++;
+ } else {
+ size += 2;
+ }
list_ptr = list.det_tail(list_ptr);
}
char[] arr = new char[size];
list_ptr = Chars;
while (!list.is_empty(list_ptr)) {
- arr[--size] = (char) list.det_head(list_ptr);
+ int c = (int) list.det_head(list_ptr);
+ if (c <= 0xffff) {
+ arr[--size] = (char) c;
+ } else {
+ string s = System.Char.ConvertFromUtf32(c);
+ arr[--size] = s[1];
+ arr[--size] = s[0];
+ }
list_ptr = list.det_tail(list_ptr);
}
@@ -1617,11 +1844,158 @@ string.from_rev_char_list(Chars, Str) :-
SUCCESS_INDICATOR = true;
").
+% XXX semidet_from_rev_char_list for Java?
+
string.semidet_from_rev_char_list(Chars::in, Str::uo) :-
string.semidet_from_char_list(list.reverse(Chars), Str).
%---------------------------------------------------------------------------%
+string.to_code_unit_list(String, List) :-
+ string.to_code_unit_list_2(String, 0, string.length(String), List).
+
+:- pred string.to_code_unit_list_2(string::in, int::in, int::in,
+ list(int)::out) is det.
+
+string.to_code_unit_list_2(String, Index, End, List) :-
+ ( Index >= End ->
+ List = []
+ ;
+ string.unsafe_index_code_unit(String, Index, Code),
+ string.to_code_unit_list_2(String, Index + 1, End, Tail),
+ List = [Code | Tail]
+ ).
+
+%-----------------------------------------------------------------------------%
+
+:- pragma foreign_proc("C",
+ string.from_code_unit_list(CodeList::in, Str::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, may_not_duplicate, no_sharing],
+"
+ MR_Word list_ptr;
+ size_t size;
+
+ size = 0;
+ list_ptr = CodeList;
+ while (! MR_list_is_empty(list_ptr)) {
+ size++;
+ list_ptr = MR_list_tail(list_ptr);
+ }
+
+ MR_allocate_aligned_string_msg(Str, size, MR_PROC_LABEL);
+
+ SUCCESS_INDICATOR = MR_TRUE;
+ size = 0;
+ list_ptr = CodeList;
+ while (! MR_list_is_empty(list_ptr)) {
+ int c;
+ c = MR_list_head(list_ptr);
+ /*
+ ** It is an error to put a null character in a string
+ ** (see the comments at the top of this file).
+ */
+ if (c == '\\0' || c > 0xff) {
+ SUCCESS_INDICATOR = MR_FALSE;
+ break;
+ }
+ Str[size] = c;
+ size++;
+ list_ptr = MR_list_tail(list_ptr);
+ }
+
+ Str[size] = '\\0';
+
+ SUCCESS_INDICATOR = SUCCESS_INDICATOR && MR_utf8_verify(Str);
+").
+
+:- pragma foreign_proc("Java",
+ string.from_code_unit_list(CodeList::in, Str::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness],
+"
+ java.lang.StringBuilder sb = new java.lang.StringBuilder();
+ boolean prev_high = false;
+
+ SUCCESS_INDICATOR = true;
+
+ Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+ for (int i : iterable) {
+ char c = (char) i;
+ if (prev_high) {
+ if (!java.lang.Character.isLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ prev_high = false;
+ } else if (java.lang.Character.isHighSurrogate(c)) {
+ prev_high = true;
+ } else if (java.lang.Character.isLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ sb.append(c);
+ }
+
+ SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+
+ if (SUCCESS_INDICATOR) {
+ Str = sb.toString();
+ } else {
+ Str = """";
+ }
+").
+
+:- pragma foreign_proc("C#",
+ string.from_code_unit_list(CodeList::in, Str::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness],
+"
+ System.Text.StringBuilder sb = new System.Text.StringBuilder();
+ bool prev_high = false;
+
+ SUCCESS_INDICATOR = true;
+
+ while (!list.is_empty(CodeList)) {
+ /* Both casts are required. */
+ char c = (char) (int) list.det_head(CodeList);
+ if (prev_high) {
+ if (!System.Char.IsLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ prev_high = false;
+ } else if (System.Char.IsHighSurrogate(c)) {
+ prev_high = true;
+ } else if (System.Char.IsLowSurrogate(c)) {
+ SUCCESS_INDICATOR = false;
+ break;
+ }
+ sb.Append(c);
+ CodeList = list.det_tail(CodeList);
+ }
+
+ SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+
+ if (SUCCESS_INDICATOR) {
+ Str = sb.ToString();
+ } else {
+ Str = """";
+ }
+").
+
+:- pragma foreign_proc("Erlang",
+ string.from_code_unit_list(CodeList::in, Str::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness],
+"
+ % XXX should validate the string
+ Str = list_to_binary(CodeList),
+ SUCCESS_INDICATOR = true
+").
+
+%---------------------------------------------------------------------------%
+
string.to_upper(StrIn, StrOut) :-
string.to_char_list(StrIn, List),
string.char_list_to_upper(List, ListUpp),
@@ -1663,15 +2037,15 @@ string.uncapitalize_first(S0, S) :-
).
string.all_match(P, String) :-
- all_match_2(string.length(String) - 1, P, String).
+ all_match_2(P, String, 0).
-:- pred all_match_2(int::in, pred(char)::in(pred(in) is semidet), string::in)
+:- pred all_match_2(pred(char)::in(pred(in) is semidet), string::in, int::in)
is semidet.
-string.all_match_2(I, P, String) :-
- ( I >= 0 ->
- P(string.unsafe_index(String, I)),
- string.all_match_2(I - 1, P, String)
+string.all_match_2(P, String, I) :-
+ ( string.unsafe_index_next(String, I, J, Char) ->
+ P(Char),
+ string.all_match_2(P, String, J)
;
true
).
@@ -1817,7 +2191,7 @@ string.is_all_digits(S) :-
").
string.pad_left(String0, PadChar, Width, String) :-
- string.length(String0, Length),
+ string.count_codepoints(String0, Length),
( Length < Width ->
Count = Width - Length,
string.duplicate_char(PadChar, Count, PadString),
@@ -1827,7 +2201,7 @@ string.pad_left(String0, PadChar, Width, String) :-
).
string.pad_right(String0, PadChar, Width, String) :-
- string.length(String0, Length),
+ string.count_codepoints(String0, Length),
( Length < Width ->
Count = Width - Length,
string.duplicate_char(PadChar, Count, PadString),
@@ -2039,9 +2413,9 @@ string.hash(String) = HashVal :-
string.hash_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal `xor` (!.HashVal `unchecked_left_shift` 5),
- !:HashVal= !.HashVal `xor` C,
+ !:HashVal = !.HashVal `xor` C,
string.hash_loop(String, Index + 1, Length, !HashVal)
;
true
@@ -2057,7 +2431,7 @@ string.hash2(String) = HashVal :-
string.hash2_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal * 37,
!:HashVal= !.HashVal + C,
string.hash2_loop(String, Index + 1, Length, !HashVal)
@@ -2075,7 +2449,7 @@ string.hash3(String) = HashVal :-
string.hash3_loop(String, Index, Length, !HashVal) :-
( Index < Length ->
- C = char.to_int(string.unsafe_index(String, Index)),
+ string.unsafe_index_code_unit(String, Index, C),
!:HashVal = !.HashVal * 49,
!:HashVal= !.HashVal + C,
string.hash3_loop(String, Index + 1, Length, !HashVal)
@@ -2095,12 +2469,16 @@ string.sub_string_search(WholeString, Pattern, Index) :-
does_not_affect_liveness, no_sharing],
"{
char *match;
- match = strstr(WholeString + BeginAt, Pattern);
- if (match) {
- Index = match - WholeString;
- SUCCESS_INDICATOR = MR_TRUE;
- } else {
+ if ((MR_Unsigned) BeginAt > strlen(WholeString)) {
SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ match = strstr(WholeString + BeginAt, Pattern);
+ if (match) {
+ Index = match - WholeString;
+ SUCCESS_INDICATOR = MR_TRUE;
+ } else {
+ SUCCESS_INDICATOR = MR_FALSE;
+ }
}
}").
@@ -2311,8 +2689,7 @@ width(Width, !PolyTypes, !Chars) :-
non_zero_digit(!Chars),
zero_or_more_occurences(digit, !Chars),
Final = !.Chars,
-
- char_list_remove_suffix(Init, Final, Width)
+ list.remove_suffix(Init, Final, Width)
).
% Do we have a precision?
@@ -2337,7 +2714,7 @@ prec(Prec, !PolyTypes, !Chars) :-
zero_or_more_occurences(digit, !Chars),
Final = !.Chars
->
- char_list_remove_suffix(Init, Final, Prec)
+ list.remove_suffix(Init, Final, Prec)
;
% When no number follows the '.' the precision defaults to 0.
Prec = ['0']
@@ -3896,36 +4273,49 @@ count_extra_trailing_zeroes(FloatStr, I, N0) = N :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+ char buf[5];
+ size_t len;
+ if (Ch <= 0x7f) {
+ /* Fast path. */
+ SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+ } else {
+ len = MR_utf8_encode(buf, Ch);
+ buf[len] = '\\0';
+ SUCCESS_INDICATOR = (strstr(Str, buf) != NULL);
+ }
").
:- pragma foreign_proc("C#",
string.contains_char(Str::in, Ch::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.IndexOf(Ch) != -1);
+ if (Ch <= 0xffff) {
+ SUCCESS_INDICATOR = (Str.IndexOf((char) Ch) != -1);
+ } else {
+ string s = System.Char.ConvertFromUtf32(Ch);
+ SUCCESS_INDICATOR = Str.Contains(s);
+ }
").
:- pragma foreign_proc("Java",
string.contains_char(Str::in, Ch::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.indexOf(Ch) != -1);
+ // indexOf(int) handles supplementary characters correctly.
+ SUCCESS_INDICATOR = (Str.indexOf((int) Ch) != -1);
").
string.contains_char(String, Char) :-
- string.contains_char(String, Char, 0, string.length(String)).
+ string.contains_char(String, Char, 0).
-:- pred string.contains_char(string::in, char::in, int::in, int::in)
- is semidet.
+:- pred string.contains_char(string::in, char::in, int::in) is semidet.
-string.contains_char(Str, Char, Index, Length) :-
- ( Index < Length ->
- string.unsafe_index(Str, Index, IndexChar),
+string.contains_char(Str, Char, I) :-
+ ( string.unsafe_index_next(Str, I, J, IndexChar) ->
( IndexChar = Char ->
true
;
- string.contains_char(Str, Char, Index + 1, Length)
+ string.contains_char(Str, Char, J)
)
;
fail
@@ -3946,6 +4336,16 @@ string.index(Str, Index, Char) :-
fail
).
+:- pragma inline(string.index_next/4).
+
+string.index_next(Str, Index, NextIndex, Char) :-
+ Len = string.length(Str),
+ ( string.index_check(Index, Len) ->
+ string.unsafe_index_next(Str, Index, NextIndex, Char)
+ ;
+ fail
+ ).
+
:- pred string.index_check(int::in, int::in) is semidet.
% We should consider making this routine a compiler built-in.
@@ -3975,44 +4375,267 @@ string.index_check(Index, Length) :-
/*-----------------------------------------------------------------------*/
+string.unsafe_index(Str, Index, Char) :-
+ ( string.unsafe_index_2(Str, Index, Char0) ->
+ Char = Char0
+ ;
+ error("string.unsafe_index: illegal sequence")
+ ).
+
+:- pred string.unsafe_index_2(string::in, int::in, char::uo) is semidet.
+
:- pragma foreign_proc("C",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- Ch = Str[Index];
+ Ch = MR_utf8_get(Str, Index);
+ SUCCESS_INDICATOR = (Ch > 0);
").
:- pragma foreign_proc("C#",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Ch = Str[Index];
+ char c1 = Str[Index];
+ if (System.Char.IsSurrogate(c1)) {
+ try {
+ char c2 = Str[Index + 1];
+ Ch = System.Char.ConvertToUtf32(c1, c2);
+ } catch (System.ArgumentOutOfRangeException) {
+ Ch = -1;
+ } catch (System.IndexOutOfRangeException) {
+ Ch = -1;
+ }
+ } else {
+ /* Common case. */
+ Ch = c1;
+ }
+ SUCCESS_INDICATOR = (Ch > 0);
").
:- pragma foreign_proc("Java",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Ch = Str.charAt(Index);
+ Ch = Str.codePointAt(Index);
+ /* XXX Unicode Java: more error checking */
+ /* XXX Unicode Java: could be more efficient */
+ SUCCESS_INDICATOR =
+ !java.lang.Character.isHighSurrogate((char) Ch) &&
+ !java.lang.Character.isLowSurrogate((char) Ch);
").
:- pragma foreign_proc("Erlang",
- string.unsafe_index(Str::in, Index::in, Ch::uo),
+ string.unsafe_index_2(Str::in, Index::in, Ch::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"
- <<_:Index/binary, Ch/integer, _/binary>> = Str
+ <<_:Index/binary, Ch/utf8, _/binary>> = Str,
+ SUCCESS_INDICATOR = true
").
-string.unsafe_index(Str, Index, Char) :-
- ( string.first_char(Str, First, Rest) ->
- ( Index = 0 ->
- Char = First
- ;
- string.unsafe_index(Rest, Index - 1, Char)
- )
- ;
- error("string.unsafe_index: out of bounds")
- ).
String ^ unsafe_elem(Index) = unsafe_index(String, Index).
+:- pragma foreign_proc("C",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ int pos = Index;
+ Ch = MR_utf8_get_next(Str, &pos);
+ NextIndex = pos;
+ SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index < Str.Length) {
+ Ch = System.Char.ConvertToUtf32(Str, Index);
+ if (Ch <= 0xffff) {
+ NextIndex = Index + 1;
+ } else {
+ NextIndex = Index + 2;
+ }
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index < Str.length()) {
+ Ch = Str.codePointAt(Index);
+ /* XXX Unicode Java: more error checking */
+ /* XXX Unicode Java: could be faster */
+ SUCCESS_INDICATOR =
+ !java.lang.Character.isHighSurrogate((char) Ch) &&
+ !java.lang.Character.isLowSurrogate((char) Ch);
+ if (SUCCESS_INDICATOR) {
+ NextIndex = Index + java.lang.Character.charCount(Ch);
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+ } else {
+ Ch = -1;
+ NextIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Erlang",
+ string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ case Str of
+ << _:Index/binary, Ch/utf8, _/binary >> ->
+ if
+ Ch =< 16#7f ->
+ NextIndex = Index + 1;
+ Ch =< 16#7ff ->
+ NextIndex = Index + 2;
+ Ch =< 16#ffff ->
+ NextIndex = Index + 3;
+ true ->
+ NextIndex = Index + 4
+ end,
+ SUCCESS_INDICATOR = true;
+ _ ->
+ Ch = -1,
+ NextIndex = Index,
+ SUCCESS_INDICATOR = false
+ end
+").
+
+:- pragma foreign_proc("C",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ int pos = Index;
+ Ch = MR_utf8_prev_get(Str, &pos);
+ PrevIndex = pos;
+ SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index > 0) {
+ char c2 = Str[Index - 1];
+ if (System.Char.IsSurrogate(c2)) {
+ try {
+ char c1 = Str[Index - 2];
+ Ch = System.Char.ConvertToUtf32(c1, c2);
+ PrevIndex = Index - 2;
+ } catch (System.IndexOutOfRangeException) {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ } catch (System.ArgumentOutOfRangeException) {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+ } else {
+ /* Common case. */
+ Ch = (int) c2;
+ PrevIndex = Index - 1;
+ }
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ if (Index > 0) {
+ Ch = Str.codePointBefore(Index);
+ PrevIndex = Index - java.lang.Character.charCount(Ch);
+ SUCCESS_INDICATOR = true;
+ } else {
+ Ch = -1;
+ PrevIndex = Index;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
+:- pragma foreign_proc("Erlang",
+ string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
+ SUCCESS_INDICATOR = (Ch =/= -1)
+").
+
+:- pragma foreign_code("Erlang", "
+do_unsafe_prev_index(Str, Index) ->
+ if Index >= 0 ->
+ case Str of
+ <<_:Index/binary, Ch/integer, _/binary>> ->
+ if
+ (Ch band 16#80) =:= 0 ->
+ {Index, Ch};
+ (Ch band 16#C0) == 16#80 ->
+ do_unsafe_prev_index(Str, Index - 1);
+ true ->
+ <<_:Index/binary, Ch2/utf8, _/binary>> = Str,
+ {Index, Ch2}
+ end;
+ true ->
+ {Index, -1}
+ end
+ ; true ->
+ {Index, -1}
+ end.
+").
+
+/*-----------------------------------------------------------------------*/
+
+:- pragma foreign_proc("C",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ const unsigned char *s = (const unsigned char *)Str;
+ Code = s[Index];
+").
+:- pragma foreign_proc("C#",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Code = Str[Index];
+").
+:- pragma foreign_proc("Java",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Code = Str.charAt(Index);
+").
+:- pragma foreign_proc("Erlang",
+ string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ <<_:Index/binary, Code/integer, _/binary>> = Str
+").
+
/*-----------------------------------------------------------------------*/
:- pragma foreign_decl("C",
@@ -4022,9 +4645,9 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
** GNU C version egcs-1.1.2 crashes with `fixed or forbidden register
** spilled' in grade asm_fast.gc.tr.debug if we write this inline.
*/
- extern void MR_set_char(MR_String str, MR_Integer ind, MR_Char ch);
+ extern void MR_set_code_unit(MR_String str, MR_Integer ind, char ch);
#else
- #define MR_set_char(str, ind, ch) \\
+ #define MR_set_code_unit(str, ind, ch) \\
((str)[ind] = (ch))
#endif
").
@@ -4036,7 +4659,7 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
** GNU C version egcs-1.1.2 crashes with `fixed or forbidden register
** spilled' in grade asm_fast.gc.tr.debug if we write this inline.
*/
- void MR_set_char(MR_String str, MR_Integer ind, MR_Char ch)
+ void MR_set_code_unit(MR_String str, MR_Integer ind, char ch)
{
str[ind] = ch;
}
@@ -4064,11 +4687,29 @@ string.set_char(Char, Index, !Str) :-
size_t len = strlen(Str0);
if ((MR_Unsigned) Index >= len) {
SUCCESS_INDICATOR = MR_FALSE;
- } else {
+ } else if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+ /* Fast path. */
SUCCESS_INDICATOR = MR_TRUE;
MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
strcpy(Str, Str0);
- MR_set_char(Str, Index, Ch);
+ MR_set_code_unit(Str, Index, Ch);
+ } else {
+ int oldc = MR_utf8_get(Str0, Index);
+ if (oldc < 0) {
+ SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ size_t oldwidth = MR_utf8_width(oldc);
+ size_t newwidth = MR_utf8_width(Ch);
+ size_t newlen;
+ size_t tailofs;
+
+ newlen = len - oldwidth + newwidth;
+ MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+ MR_memcpy(Str, Str0, Index);
+ MR_utf8_encode(Str + Index, Ch);
+ strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+ SUCCESS_INDICATOR = MR_TRUE;
+ }
}
").
:- pragma foreign_proc("C#",
@@ -4080,7 +4721,18 @@ string.set_char(Char, Index, !Str) :-
SUCCESS_INDICATOR = false;
} else {
System.Text.StringBuilder sb = new System.Text.StringBuilder(Str0);
- sb[Index] = Ch;
+ if (!System.Char.IsHighSurrogate(Str0, Index) && Ch <= 0xffff) {
+ /* Fast path. */
+ sb[Index] = (char) Ch;
+ } else {
+ // XXX Unicode C#: error checking
+ if (System.Char.IsLowSurrogate(Str0, Index + 1)) {
+ sb.Remove(Index, 2);
+ } else {
+ sb.Remove(Index, 1);
+ }
+ sb.Insert(Index, System.Char.ConvertFromUtf32(Ch));
+ }
Str = sb.ToString();
SUCCESS_INDICATOR = true;
}
@@ -4094,7 +4746,17 @@ string.set_char(Char, Index, !Str) :-
SUCCESS_INDICATOR = false;
} else {
java.lang.StringBuilder sb = new StringBuilder(Str0);
- sb.setCharAt(Index, Ch);
+
+ int oldc = sb.codePointAt(Index);
+ int oldwidth = java.lang.Character.charCount(oldc);
+ int newwidth = java.lang.Character.charCount(Ch);
+ if (oldwidth == 1 && newwidth == 1) {
+ sb.setCharAt(Index, (char) Ch);
+ } else {
+ char[] buf = java.lang.Character.toChars(Ch);
+ sb.replace(Index, Index + oldwidth, new String(buf));
+ }
+
Str = sb.toString();
SUCCESS_INDICATOR = true;
}
@@ -4104,6 +4766,7 @@ string.set_char_2(Ch, Index, Str0, Str) :-
list.replace_nth(List0, Index + 1, Ch, List),
string.to_char_list(Str, List).
+% XXX Unicode: what is this?
% :- pragma foreign_proc("C",
% string.set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
% [will_not_call_mercury, promise_pure, thread_safe, does_not_affect_liveness],
@@ -4124,6 +4787,7 @@ string.set_char_2(Ch, Index, Str0, Str) :-
% if (Index >= Str0.Length) {
% SUCCESS_INDICATOR = false;
% } else {
+% // UTF-16
% Str = System.String.Concat(Str0.Substring(0, Index),
% System.Convert.ToString(Ch),
% Str0.Substring(Index + 1));
@@ -4152,32 +4816,65 @@ string.unsafe_set_char(Char, Index, !Str) :-
does_not_affect_liveness],
"
size_t len = strlen(Str0);
- MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
- strcpy(Str, Str0);
- MR_set_char(Str, Index, Ch);
+ if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+ /* Fast path. */
+ SUCCESS_INDICATOR = MR_TRUE;
+ MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
+ strcpy(Str, Str0);
+ MR_set_code_unit(Str, Index, Ch);
+ } else {
+ int oldc = MR_utf8_get(Str0, Index);
+ if (oldc < 0) {
+ SUCCESS_INDICATOR = MR_FALSE;
+ } else {
+ size_t oldwidth = MR_utf8_width(oldc);
+ size_t newwidth = MR_utf8_width(Ch);
+ size_t newlen;
+ size_t tailofs;
+
+ newlen = len - oldwidth + newwidth;
+ MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+ MR_memcpy(Str, Str0, Index);
+ MR_utf8_encode(Str + Index, Ch);
+ strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+ SUCCESS_INDICATOR = MR_TRUE;
+ }
+ }
").
:- pragma foreign_proc("C#",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Str = System.String.Concat(Str0.Substring(0, Index),
- System.Convert.ToString(Ch),
- Str0.Substring(Index + 1));
+ // XXX Unicode C#: need more error checking
+ if (System.Char.IsHighSurrogate(Str0, Index)) {
+ Str = System.String.Concat(Str0.Substring(0, Index),
+ System.Char.ConvertFromUtf32(Ch),
+ Str0.Substring(Index + 2));
+ } else {
+ Str = System.String.Concat(Str0.Substring(0, Index),
+ System.Char.ConvertFromUtf32(Ch),
+ Str0.Substring(Index + 1));
+ }
").
:- pragma foreign_proc("Java",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Str = Str0.substring(0, Index) + Ch + Str0.substring(Index + 1);
+ int oldc = Str0.codePointAt(Index);
+ int oldwidth = java.lang.Character.charCount(oldc);
+ Str = Str0.subSequence(0, Index)
+ + new String(Character.toChars(Ch))
+ + Str0.subSequence(Index + oldwidth, Str0.length());
").
:- pragma foreign_proc("Erlang",
string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
- <<Left:Index/binary, _/integer, Right/binary>> = Str0,
- Str = list_to_binary([Left, Ch, Right])
+ <<Left:Index/binary, _/utf8, Right/binary>> = Str0,
+ Str = unicode:characters_to_binary([Left, Ch, Right])
").
+% NOTE: these procs do not understand variable width encodings
% :- pragma foreign_proc("C",
% string.unsafe_set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
% [will_not_call_mercury, promise_pure, thread_safe, does_not_affect_liveness],
@@ -4256,20 +4953,140 @@ string.unsafe_set_char(Char, Index, !Str) :-
Length = size(Str)
").
-string.length(Str0, Len) :-
- % XXX This copy is only necessary because of the ui.
- copy(Str0, Str),
- string.length_2(Str, 0, Len).
+string.length(Str, Len) :-
+ string.to_code_unit_list(Str, CodeList),
+ list.length(CodeList, Len).
+
+string.count_code_units(Str) = string.length(Str).
+
+string.count_code_units(Str, Length) :-
+ string.length(Str, Length).
+
+/*-----------------------------------------------------------------------*/
+
+string.count_codepoints(String) = Count :-
+ string.count_codepoints(String, Count).
+
+string.count_codepoints(String, Count) :-
+ count_codepoints_2(String, 0, 0, Count).
+
+:- pred count_codepoints_2(string::in, int::in, int::in, int::out) is det.
+
+count_codepoints_2(String, I, Count0, Count) :-
+ ( string.unsafe_index_next(String, I, J, _) ->
+ count_codepoints_2(String, J, Count0 + 1, Count)
+ ;
+ Count = Count0
+ ).
+
+:- pragma foreign_proc("C",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ unsigned char b;
+ int i;
+
+ Count = 0;
+ for (i = 0; ; i++) {
+ b = String[i];
+ if (b == '\\0') {
+ break;
+ }
+ if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+ Count++;
+ }
+ }
+").
+
+:- pragma foreign_proc("C#",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ Count = 0;
+ foreach (char c in String) {
+ if (!System.Char.IsLowSurrogate(c)) {
+ Count++;
+ }
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.count_codepoints(String::in, Count::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ Count = String.codePointCount(0, String.length());
+").
-:- pred string.length_2(string::in, int::in, int::out) is det.
+ % Note: we do not define what happens with unpaired surrogates.
+ %
+string.codepoint_offset(String, N, Index) :-
+ string.codepoint_offset(String, 0, N, Index).
+
+% XXX unicode: untested
+string.codepoint_offset(String, StartOffset, N, Index) :-
+ StartOffset >= 0,
+ StartOffset < string.length(String),
+ string.codepoint_offset_2(String, StartOffset, N, Index).
-string.length_2(Str, Index, Length) :-
- ( string.index(Str, Index, _) ->
- string.length_2(Str, Index + 1, Length)
+% XXX unicode: untested
+:- pred codepoint_offset_2(string::in, int::in, int::in, int::out) is semidet.
+
+codepoint_offset_2(String, Offset, N, Index) :-
+ ( N = 0 ->
+ Index = Offset
;
- Length = Index
+ string.unsafe_index_next(String, Offset, NextOffset, _),
+ string.codepoint_offset_2(String, NextOffset, N - 1, Index)
).
+:- pragma foreign_proc("C",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ size_t len;
+ unsigned char b;
+
+ SUCCESS_INDICATOR = MR_FALSE;
+ len = strlen(String);
+ for (Index = StartOffset; Index < len; Index++) {
+ b = String[Index];
+ if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+ if (N-- == 0) {
+ SUCCESS_INDICATOR = MR_TRUE;
+ break;
+ }
+ }
+ }
+").
+
+:- pragma foreign_proc("C#",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ SUCCESS_INDICATOR = false;
+ for (Index = StartOffset; Index < String.Length; Index++) {
+ if (!System.Char.IsLowSurrogate(String, Index)) {
+ if (N-- == 0) {
+ SUCCESS_INDICATOR = true;
+ break;
+ }
+ }
+ }
+").
+
+:- pragma foreign_proc("Java",
+ string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+ try {
+ Index = String.offsetByCodePoints(StartOffset, N);
+ SUCCESS_INDICATOR = true;
+ } catch (IndexOutOfBoundsException e) {
+ Index = -1;
+ SUCCESS_INDICATOR = false;
+ }
+").
+
/*-----------------------------------------------------------------------*/
:- pragma promise_equivalent_clauses(string.append/3).
@@ -4438,7 +5255,8 @@ string.append_ooi_2(NextS1Len, S3Len, S1, S2, S3) :-
(
string.append_ooi_3(NextS1Len, S3Len, S1, S2, S3)
;
- string.append_ooi_2(NextS1Len + 1, S3Len, S1, S2, S3)
+ string.unsafe_index_next(S3, NextS1Len, AdvS1Len, _),
+ string.append_ooi_2(AdvS1Len, S3Len, S1, S2, S3)
)
).
@@ -4513,12 +5331,15 @@ substring(Str, !.Start, !.Count, SubStr) :-
:- func strchars(int, int, string) = list(char).
strchars(I, End, Str) = Chars :-
- ( I >= End ->
- Chars = []
- ;
- C = string.unsafe_index(Str, I),
- Cs = strchars(I + 1, End, Str),
+ (
+ I < End,
+ string.unsafe_index_next(Str, I, J, C),
+ J =< End
+ ->
+ Cs = strchars(J, End, Str),
Chars = [C | Cs]
+ ;
+ Chars = []
).
:- pragma foreign_proc("C",
@@ -4611,6 +5432,23 @@ strchars(I, End, Str) = Chars :-
end
").
+string.substring_by_codepoint(Str, Start, Count) = SubString :-
+ string.substring_by_codepoint(Str, Start, Count, SubString).
+
+string.substring_by_codepoint(Str, Start, Count, SubString) :-
+ ( string.codepoint_offset(Str, Start, StartOffset0) ->
+ StartOffset = StartOffset0
+ ;
+ StartOffset = 0
+ ),
+ ( string.codepoint_offset(Str, StartOffset, Count, EndOffset0) ->
+ EndOffset = EndOffset0
+ ;
+ EndOffset = string.length(Str)
+ ),
+ OffsetCount = EndOffset - StartOffset,
+ string.substring(Str, StartOffset, OffsetCount, SubString).
+
:- pragma foreign_proc("C",
string.unsafe_substring(Str::in, Start::in, Count::in, SubString::uo),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -4722,24 +5560,36 @@ strchars(I, End, Str) = Chars :-
end
").
-string.split(Str, Count, Left, Right) :-
- ( Count =< 0 ->
+% XXX this portable version is no longer correct as it deals in code points
+% string.split(Str, Count, Left, Right) :-
+% ( Count =< 0 ->
+% Left = "",
+% copy(Str, Right)
+% ;
+% string.to_char_list(Str, List),
+% Len = string.length(Str),
+% ( Count > Len ->
+% Num = Len
+% ;
+% Num = Count
+% ),
+% ( list.split_list(Num, List, LeftList, RightList) ->
+% string.to_char_list(Left, LeftList),
+% string.to_char_list(Right, RightList)
+% ;
+% error("string.split")
+% )
+% ).
+
+string.split_by_codepoint(Str, Count, Left, Right) :-
+ ( string.codepoint_offset(Str, Count, Offset) ->
+ string.split(Str, Offset, Left, Right)
+ ; Count =< 0 ->
Left = "",
copy(Str, Right)
;
- string.to_char_list(Str, List),
- Len = string.length(Str),
- ( Count > Len ->
- Num = Len
- ;
- Num = Count
- ),
- ( list.split_list(Num, List, LeftList, RightList) ->
- string.to_char_list(Left, LeftList),
- string.to_char_list(Right, RightList)
- ;
- error("string.split")
- )
+ copy(Str, Left),
+ Right = ""
).
/*-----------------------------------------------------------------------*/
@@ -4749,10 +5599,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ int pos = 0;
+ int c = MR_utf8_get_next(Str, &pos);
SUCCESS_INDICATOR = (
- Str[0] == First &&
+ c == First &&
First != '\\0' &&
- strcmp(Str + 1, Rest) == 0
+ strcmp(Str + pos, Rest) == 0
);
").
:- pragma foreign_proc("C#",
@@ -4760,26 +5612,39 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe],
"
int len = Str.Length;
- SUCCESS_INDICATOR = (
- len > 0 &&
- Str[0] == First &&
- System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
- );
+ if (First <= 0xffff) {
+ SUCCESS_INDICATOR = (
+ len > 0 &&
+ Str[0] == First &&
+ System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
+ );
+ } else {
+ string firstchars = System.Char.ConvertFromUtf32(First);
+ SUCCESS_INDICATOR = (
+ len > 1 &&
+ Str[0] == firstchars[0] &&
+ Str[1] == firstchars[1] &&
+ System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
+ );
+ }
").
:- pragma foreign_proc("Java",
string.first_char(Str::in, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- SUCCESS_INDICATOR = (Str.length() == Rest.length() + 1 &&
- Str.charAt(0) == First &&
- Str.endsWith(Rest));
+ int toffset = java.lang.Character.charCount(First);
+ SUCCESS_INDICATOR = (
+ Str.length() > 0 &&
+ Str.codePointAt(0) == First &&
+ Str.regionMatches(toffset, Rest, 0, Rest.length())
+ );
").
:- pragma foreign_proc("Erlang",
string.first_char(Str::in, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
case Str of
- <<First, Rest/binary>> ->
+ <<First/utf8, Rest/binary>> ->
SUCCESS_INDICATOR = true;
_ ->
SUCCESS_INDICATOR = false
@@ -4791,19 +5656,31 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- First = Str[0];
- SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + 1, Rest) == 0);
+ int pos = 0;
+ First = MR_utf8_get_next(Str, &pos);
+ SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + pos, Rest) == 0);
").
:- pragma foreign_proc("C#",
string.first_char(Str::in, First::uo, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- int len = Str.Length;
- if (len > 0) {
- SUCCESS_INDICATOR = (System.String.CompareOrdinal(Str, 1, Rest, 0, len)
- == 0);
- First = Str[0];
- } else {
+ try {
+ int len = Str.Length;
+ char c1 = Str[0];
+ if (System.Char.IsHighSurrogate(c1)) {
+ char c2 = Str[1];
+ First = System.Char.ConvertToUtf32(c1, c2);
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
+ } else {
+ First = c1;
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
+ }
+ } catch (System.IndexOutOfRangeException) {
+ SUCCESS_INDICATOR = false;
+ First = (char) 0;
+ } catch (System.ArgumentOutOfRangeException) {
SUCCESS_INDICATOR = false;
First = (char) 0;
}
@@ -4812,13 +5689,15 @@ string.split(Str, Count, Left, Right) :-
string.first_char(Str::in, First::uo, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- if (Str.length() == Rest.length() + 1 && Str.endsWith(Rest)) {
- SUCCESS_INDICATOR = true;
- First = Str.charAt(0);
+ int toffset;
+ if (Str.length() > 0) {
+ First = Str.codePointAt(0);
+ toffset = java.lang.Character.charCount(First);
+ SUCCESS_INDICATOR = Str.regionMatches(toffset, Rest, 0, Rest.length());
} else {
SUCCESS_INDICATOR = false;
// XXX to avoid uninitialized var warning
- First = (char) 0;
+ First = 0;
}
").
:- pragma foreign_proc("Erlang",
@@ -4826,7 +5705,7 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe],
"
case Str of
- << First, Rest/binary >> ->
+ <<First/utf8, Rest/binary>> ->
SUCCESS_INDICATOR = true;
_ ->
SUCCESS_INDICATOR = false,
@@ -4839,10 +5718,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- if (Str[0] != First || First == '\\0') {
+ int pos = 0;
+ int c = MR_utf8_get_next(Str, &pos);
+ if (c != First || First == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
} else {
- Str++;
+ Str += pos;
/*
** We need to make a copy to ensure that the pointer is word-aligned.
*/
@@ -4858,8 +5739,15 @@ string.split(Str, Count, Left, Right) :-
int len = Str.Length;
if (len > 0) {
- SUCCESS_INDICATOR = (First == Str[0]);
- Rest = Str.Substring(1);
+ if (First <= 0xffff) {
+ SUCCESS_INDICATOR = (First == Str[0]);
+ Rest = Str.Substring(1);
+ } else {
+ string firststr = System.Char.ConvertFromUtf32(First);
+ SUCCESS_INDICATOR =
+ (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) == 0);
+ Rest = Str.Substring(2);
+ }
} else {
SUCCESS_INDICATOR = false;
Rest = null;
@@ -4872,8 +5760,8 @@ string.split(Str, Count, Left, Right) :-
int len = Str.length();
if (len > 0) {
- SUCCESS_INDICATOR = (First == Str.charAt(0));
- Rest = Str.substring(1);
+ SUCCESS_INDICATOR = (First == Str.codePointAt(0));
+ Rest = Str.substring(java.lang.Character.charCount(First));
} else {
SUCCESS_INDICATOR = false;
// XXX to avoid uninitialized var warning
@@ -4885,7 +5773,7 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe],
"
case Str of
- <<First, Rest/binary>> ->
+ <<First/utf8, Rest/binary>> ->
SUCCESS_INDICATOR = true;
_ ->
SUCCESS_INDICATOR = false,
@@ -4898,11 +5786,12 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- First = Str[0];
+ int pos = 0;
+ First = MR_utf8_get_next(Str, &pos);
if (First == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
} else {
- Str++;
+ Str += pos;
/*
** We need to make a copy to ensure that the pointer is word-aligned.
*/
@@ -4915,14 +5804,25 @@ string.split(Str, Count, Left, Right) :-
string.first_char(Str::in, First::uo, Rest::uo),
[will_not_call_mercury, promise_pure, thread_safe],
"{
- if (Str.Length == 0) {
+ try {
+ char c1 = Str[0];
+ if (System.Char.IsHighSurrogate(c1)) {
+ char c2 = Str[1];
+ First = System.Char.ConvertToUtf32(c1, c2);
+ Rest = Str.Substring(2);
+ } else {
+ First = Str[0];
+ Rest = Str.Substring(1);
+ }
+ SUCCESS_INDICATOR = true;
+ } catch (System.IndexOutOfRangeException) {
+ SUCCESS_INDICATOR = false;
+ First = (char) 0;
+ Rest = null;
+ } catch (System.ArgumentOutOfRangeException) {
SUCCESS_INDICATOR = false;
First = (char) 0;
Rest = null;
- } else {
- First = Str[0];
- Rest = Str.Substring(1);
- SUCCESS_INDICATOR = true;
}
}").
:- pragma foreign_proc("Java",
@@ -4931,12 +5831,12 @@ string.split(Str, Count, Left, Right) :-
"{
if (Str.length() == 0) {
SUCCESS_INDICATOR = false;
- // XXX to avoid uninitialized var warnings:
First = (char) 0;
Rest = null;
} else {
- First = Str.charAt(0);
- Rest = Str.substring(1);
+ // XXX Unicode Java: do error checking?
+ First = Str.codePointAt(0);
+ Rest = Str.substring(java.lang.Character.charCount(First));
SUCCESS_INDICATOR = true;
}
}").
@@ -4945,7 +5845,7 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe],
"
case Str of
- <<First, Rest/binary>> ->
+ <<First/utf8, Rest/binary>> ->
SUCCESS_INDICATOR = true;
_ ->
SUCCESS_INDICATOR = false,
@@ -4959,31 +5859,36 @@ string.split(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- size_t len = strlen(Rest) + 1;
+ size_t firstw = MR_utf8_width(First);
+ size_t len = firstw + strlen(Rest);
MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
- Str[0] = First;
- strcpy(Str + 1, Rest);
+ MR_utf8_encode(Str, First);
+ strcpy(Str + firstw, Rest);
}").
:- pragma foreign_proc("C#",
string.first_char(Str::uo, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"{
string FirstStr;
- FirstStr = new System.String(First, 1);
- Str = System.String.Concat(FirstStr, Rest);
+ if (First <= 0xffff) {
+ FirstStr = new System.String((char) First, 1);
+ } else {
+ FirstStr = System.Char.ConvertFromUtf32(First);
+ }
+ Str = FirstStr + Rest;
}").
:- pragma foreign_proc("Java",
string.first_char(Str::uo, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"{
- java.lang.String FirstStr = java.lang.String.valueOf(First);
+ String FirstStr = new String(Character.toChars(First));
Str = FirstStr.concat(Rest);
}").
:- pragma foreign_proc("Erlang",
string.first_char(Str::uo, First::in, Rest::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
- Str = list_to_binary([First, Rest])
+ Str = unicode:characters_to_binary([First, Rest])
").
%-----------------------------------------------------------------------------%
@@ -5086,23 +5991,40 @@ string.format(S1, PT) = S2 :-
%------------------------------------------------------------------------------%
string.words_separator(SepP, String) = Words :-
- I = preceding_boundary(isnt(SepP), String, string.length(String) - 1),
- Words = words_2(SepP, String, I, []).
+ next_boundary(SepP, String, 0, WordStart),
+ words_2(SepP, String, WordStart, Words).
-%------------------------------------------------------------------------------%
+:- pred words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
+ list(string)::out) is det.
-:- func words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
- list(string)::in) = (list(string)::out) is det.
+words_2(SepP, String, WordStart, Words) :-
+ next_boundary(isnt(SepP), String, WordStart, WordEnd),
+ ( WordEnd = WordStart ->
+ Words = []
+ ;
+ string.unsafe_substring(String, WordStart, WordEnd - WordStart, Word),
+ next_boundary(SepP, String, WordEnd, NextWordStart),
+ ( WordEnd = NextWordStart ->
+ Words = [Word]
+ ;
+ words_2(SepP, String, NextWordStart, Words0),
+ Words = [Word | Words0]
+ )
+ ).
-words_2(SepP, String, WordEnd, Words0) = Words :-
- ( WordEnd < 0 ->
- Words = Words0
+ % Return the smallest I >= I0 such that `not P(String[I])'.
+ %
+:- pred next_boundary(pred(char)::in(pred(in) is semidet), string::in, int::in,
+ int::out) is det.
+
+next_boundary(P, String, I0, I) :-
+ (
+ string.unsafe_index_next(String, I0, I1, Char),
+ P(Char)
+ ->
+ next_boundary(P, String, I1, I)
;
- WordPre = preceding_boundary(SepP, String, WordEnd),
- Word = string.unsafe_substring(String, WordPre + 1,
- WordEnd - WordPre),
- PrevWordEnd = preceding_boundary(isnt(SepP), String, WordPre),
- Words = words_2(SepP, String, PrevWordEnd, [Word | Words0])
+ I = I0
).
%------------------------------------------------------------------------------%
@@ -5113,33 +6035,32 @@ string.words(String) = string.words_separator(char.is_whitespace, String).
string.split_at_separator(DelimP, String) = Substrings :-
Len = string.length(String),
- split_at_separator_2(DelimP, String, Len - 1, Len, [], Substrings).
+ split_at_separator_2(DelimP, String, Len, Len, [], Substrings).
:- pred split_at_separator_2(pred(char)::in(pred(in) is semidet), string::in,
int::in, int::in, list(string)::in, list(string)::out) is det.
split_at_separator_2(DelimP, Str, I, SegEnd, Acc0, Acc) :-
- % Walk Str backwards extending the accumulated list of chunks as chars
- % matching DelimP are found.
+ % Walk Str backwards extending the accumulated list of chunks as code
+ % points matching DelimP are found.
%
- % Invariant: -1 =< I < length(Str)
+ % Invariant: 0 =< I =< length(Str)
% SegEnd is one past the last index of the current segment.
%
- ( I < 0 ->
- % We've reached the beginning of the string.
- Seg = string.unsafe_substring(Str, 0, SegEnd),
- Acc = [Seg | Acc0]
- ;
- C = string.unsafe_index(Str, I),
+ ( string.unsafe_prev_index(Str, I, J, C) ->
( DelimP(C) ->
% Chop here.
- SegStart = I + 1,
+ SegStart = I,
Seg = string.unsafe_substring(Str, SegStart, SegEnd - SegStart),
- split_at_separator_2(DelimP, Str, I - 1, I, [Seg | Acc0], Acc)
+ split_at_separator_2(DelimP, Str, J, J, [Seg | Acc0], Acc)
;
% Extend current segment.
- split_at_separator_2(DelimP, Str, I - 1, SegEnd, Acc0, Acc)
+ split_at_separator_2(DelimP, Str, J, SegEnd, Acc0, Acc)
)
+ ;
+ % We've reached the beginning of the string.
+ Seg = string.unsafe_substring(Str, 0, SegEnd),
+ Acc = [Seg | Acc0]
).
%------------------------------------------------------------------------------%
@@ -5156,36 +6077,16 @@ split_at_string(Needle, Total) =
split_at_string(StartAt, NeedleLen, Needle, Total) = Out :-
( sub_string_search_start(Total, Needle, StartAt, NeedlePos) ->
- BeforeNeedle = substring(Total, StartAt, NeedlePos-StartAt),
+ BeforeNeedle = substring(Total, StartAt, NeedlePos - StartAt),
Tail = split_at_string(NeedlePos+NeedleLen, NeedleLen, Needle, Total),
Out = [BeforeNeedle | Tail]
;
- string__split(Total, StartAt, _skip, Last),
+ string.split(Total, StartAt, _Skip, Last),
Out = [Last]
).
%------------------------------------------------------------------------------%
- % preceding_boundary(SepP, String, I) returns the largest index J =< I
- % in String of the char that is SepP and min(-1, I) if there is no such J.
- % preceding_boundary/3 is intended for finding (in reverse) consecutive
- % maximal sequences of chars satisfying some property. Note that I
- % *must not* exceed the largest valid index for String.
- %
-:- func preceding_boundary(pred(char)::in(pred(in) is semidet), string::in,
- int::in) = (int::out) is det.
-
-preceding_boundary(SepP, String, I) =
- ( I < 0 ->
- I
- ; SepP(string.unsafe_index(String, I)) ->
- I
- ;
- preceding_boundary(SepP, String, I - 1)
- ).
-
-%------------------------------------------------------------------------------%
-
S1 ++ S2 = string.append(S1, S2).
%------------------------------------------------------------------------------%
@@ -5203,11 +6104,13 @@ string.det_base_string_to_int(Base, S) = N :-
%-----------------------------------------------------------------------------%
-chomp(S) =
- ( index(S, length(S) - 1, '\n') ->
- left(S, length(S) - 1)
+chomp(S) = Chomp :-
+ % '\n' is always represented by a single code unit.
+ Offset = string.length(S) - 1,
+ ( index(S, Offset, '\n') ->
+ Chomp = left(S, Offset)
;
- S
+ Chomp = S
).
%-----------------------------------------------------------------------------%
@@ -5235,40 +6138,39 @@ lstrip_pred(P, S) = right(S, length(S) - prefix_length(P, S)).
%-----------------------------------------------------------------------------%
-prefix_length(P, S) = prefix_length_2(0, length(S), P, S).
+prefix_length(P, S) = Index :-
+ prefix_length_2(P, S, 0, Index).
-:- func prefix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
- string::in) = (int::out) is det.
+:- pred prefix_length_2(pred(char)::in(pred(in) is semidet),
+ string::in, int::in, int::out) is det.
-prefix_length_2(I, N, P, S) =
- % XXX We are using if-then-elses to get ordered conjunction.
- ( I < N ->
- ( P(S ^ unsafe_elem(I)) ->
- prefix_length_2(I + 1, N, P, S)
- ;
- I
- )
+prefix_length_2(P, S, I, Index) :-
+ (
+ string.unsafe_index_next(S, I, J, Char),
+ P(Char)
+ ->
+ prefix_length_2(P, S, J, Index)
;
- I
+ Index = I
).
%-----------------------------------------------------------------------------%
-suffix_length(P, S) = suffix_length_2(length(S) - 1, length(S), P, S).
+suffix_length(P, S) = End - Index :-
+ End = string.length(S),
+ suffix_length_2(P, S, End, Index).
-:- func suffix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
- string::in) = (int::out) is det.
+:- pred suffix_length_2(pred(char)::in(pred(in) is semidet),
+ string::in, int::in, int::out) is det.
-suffix_length_2(I, N, P, S) =
- % XXX We are using if-then-elses to get ordered conjunction.
- ( 0 =< I ->
- ( P(S ^ unsafe_elem(I)) ->
- suffix_length_2(I - 1, N, P, S)
- ;
- N - (I + 1)
- )
+suffix_length_2(P, S, I, Index) :-
+ (
+ string.unsafe_prev_index(S, I, J, Char),
+ P(Char)
+ ->
+ suffix_length_2(P, S, J, Index)
;
- N - (I + 1)
+ Index = I
).
%------------------------------------------------------------------------------%
@@ -5686,28 +6588,6 @@ private_builtin_type_info_to_revstrings(PrivateBuiltinTypeInfo, !Rs) :-
det_dynamic_cast(X, Y) :-
det_univ_to_type(univ(X), Y).
-%-----------------------------------------------------------------------------%
-
- % char_list_remove_suffix/3: We use this instead of the more general
- % list.remove_suffix so that (for example) string.format will succeed in
- % grade Java, even though unification has not yet been implemented.
- %
-:- pred char_list_remove_suffix(list(char)::in, list(char)::in,
- list(char)::out) is semidet.
-
-char_list_remove_suffix(List, Suffix, Prefix) :-
- list.length(List, ListLength),
- list.length(Suffix, SuffixLength),
- PrefixLength = ListLength - SuffixLength,
- list.split_list(PrefixLength, List, Prefix, Rest),
- char_list_equal(Suffix, Rest).
-
-:- pred char_list_equal(list(char)::in, list(char)::in) is semidet.
-
-char_list_equal([], []).
-char_list_equal([X | Xs], [X | Ys]) :-
- char_list_equal(Xs, Ys).
-
%------------------------------------------------------------------------------%
string.format_table(Columns, Separator) = Table :-
@@ -5730,7 +6610,7 @@ string.format_table(Columns, Separator) = Table :-
string.format_table_max(ColumnsLimits, Separator) = Table :-
MaxWidthsSenses = list.map(find_max_length_with_limit, ColumnsLimits),
Columns = list.map(project_column_strings, ColumnsLimits),
- SepLen = string.length(Separator),
+ SepLen = string.count_codepoints(Separator),
generate_rows(MaxWidthsSenses, Separator, SepLen, Columns, [], RevRows),
list.reverse(RevRows, Rows),
Table = string.join_list("\n", Rows).
@@ -5784,7 +6664,7 @@ pad_row([Justify - MaxWidth | JustifyWidths], [ColumnString0 | ColumnStrings0],
NextColumn = CurColumn + MaxWidth + SepLen,
pad_row(JustifyWidths, ColumnStrings0, Separator, SepLen, NextColumn,
LineRest),
- ( string.length(ColumnString0) =< MaxWidth ->
+ ( string.count_codepoints(ColumnString0) =< MaxWidth ->
(
Justify = just_left,
ColumnString = string.pad_right(ColumnString0, ' ', MaxWidth)
@@ -5870,7 +6750,7 @@ lpad(Chr, N, Str) = string.pad_left(Str, Chr, N).
:- pred max_str_length(string::in, int::in, int::out) is det.
max_str_length(Str, PrevMaxLen, MaxLen) :-
- Length = string.length(Str),
+ Length = string.count_codepoints(Str),
( Length > PrevMaxLen ->
MaxLen = Length
;
@@ -5883,7 +6763,7 @@ word_wrap(Str, N) = word_wrap_separator(Str, N, "").
word_wrap_separator(Str, N, WordSep) = Wrapped :-
Words = string.words_separator(char.is_whitespace, Str),
- SepLen = string.length(WordSep),
+ SepLen = string.count_codepoints(WordSep),
( SepLen < N ->
word_wrap_2(Words, WordSep, SepLen, 1, N, [], Wrapped)
;
@@ -5899,7 +6779,7 @@ word_wrap_2([], _, _, _, _, RevStrs,
word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
% Col is the column where the next character should be written if there
% is space for a whole word.
- WordLen = string.length(Word),
+ WordLen = string.count_codepoints(Word),
(
% We are on the first column and the length of the word
% is less than the line length.
@@ -5972,10 +6852,10 @@ word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
:- func break_up_string_reverse(string, int, list(string)) = list(string).
break_up_string_reverse(Str, N, Prev) = Strs :-
- ( string.length(Str) =< N ->
+ ( string.count_codepoints(Str) =< N ->
Strs = [Str | Prev]
;
- string.split(Str, N, Left, Right),
+ string.split_by_codepoint(Str, N, Left, Right),
Strs = break_up_string_reverse(Right, N, [Left | Prev])
).
diff --git a/library/term_io.m b/library/term_io.m
index 0be7ccb..9b7acf5 100644
--- a/library/term_io.m
+++ b/library/term_io.m
@@ -784,6 +784,7 @@ mercury_escape_char(Char) = EscapeCode :-
is_mercury_source_char(Char) :-
( char.is_alnum(Char)
; is_mercury_punctuation_char(Char)
+ ; char.to_int(Char) >= 0x80
).
% Currently we only allow the following characters.
diff --git a/runtime/mercury_string.c b/runtime/mercury_string.c
index 0f20219..fb6035b 100644
--- a/runtime/mercury_string.c
+++ b/runtime/mercury_string.c
@@ -112,6 +112,7 @@ MR_hash_string(MR_ConstString s)
{
MR_HASH_STRING_FUNC_BODY
}
+
MR_Integer
MR_hash_string2(MR_ConstString s)
{
@@ -123,3 +124,222 @@ MR_hash_string3(MR_ConstString s)
{
MR_HASH_STRING3_FUNC_BODY
}
+
+MR_bool
+MR_utf8_next(const MR_String s_, int *pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+
+ if (*pos == '\0') {
+ return MR_FALSE;
+ }
+
+ for (;;) {
+ ++(*pos);
+ if (*pos == '\0') {
+ break;
+ }
+ c = s[*pos];
+ if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+ break;
+ }
+ }
+
+ return MR_TRUE;
+}
+
+MR_bool
+MR_utf8_prev(const MR_String s_, int *pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+
+ if (*pos <= 0) {
+ return MR_FALSE;
+ }
+
+ while (*pos > 0) {
+ (*pos)--;
+ c = s[*pos];
+ if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+ break;
+ }
+ }
+
+ return MR_TRUE;
+}
+
+int
+MR_utf8_get(const MR_String s_, int pos)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+ int remain;
+ int minc;
+ int i;
+
+ c = s[pos];
+
+ if (c <= 0x7F) {
+ /* Plain ASCII (including NUL terminator). */
+ return c;
+ }
+
+ if (c <= 0xC1) {
+ /* Trailing byte of multi-byte sequence or an overlong encoding for
+ * code point <= 127.
+ */
+ return -2;
+ }
+
+ if (c <= 0xDF) {
+ /* 2-byte sequence. */
+ c &= 0x1F;
+ remain = 1;
+ minc = 0x80;
+ }
+ else if (c <= 0xEF) {
+ /* 3-byte sequence. */
+ c &= 0x0F;
+ remain = 2;
+ minc = 0x800;
+ }
+ else if (c <= 0xF4) {
+ /* 4-byte sequence. */
+ c &= 0x07;
+ remain = 3;
+ minc = 0x10000;
+ }
+ else {
+ /* Otherwise invalid. */
+ return -2;
+ }
+
+ for (i = 1; i <= remain; i++) {
+ if (s[pos + i] == '\0') {
+ return -2;
+ }
+ }
+
+ while (remain--) {
+ int d = s[++pos];
+
+ if (!MR_utf8_is_trail_byte(d)) {
+ return -2;
+ }
+
+ c = (c << 6) | (d & 0x3F);
+ }
+
+ /* Check for overlong forms, which could be used to bypass security
+ * validations. We could also check code points aren't above U+10FFFF or in
+ * the surrogate ranges, but we don't.
+ */
+
+ if (c < minc) {
+ return -2;
+ }
+
+ return c;
+}
+
+int
+MR_utf8_get_next(const MR_String s, int *pos)
+{
+ int c = MR_utf8_get(s, *pos);
+
+ if (c >= 0) {
+ (*pos) += MR_utf8_width(c);
+ return c;
+ }
+
+ /* Some invalid byte sequence. */
+ MR_utf8_next(s, pos);
+ return c;
+}
+
+int
+MR_utf8_prev_get(const MR_String s, int *pos)
+{
+ if (MR_utf8_prev(s, pos)) {
+ return MR_utf8_get(s, *pos);
+ }
+
+ /* Past beginning. */
+ return -1;
+}
+
+
+size_t
+MR_utf8_width(int c)
+{
+ /* So we don't need to check for negative values nor use unsigned ints
+ * in the interface, which are a pain.
+ */
+ unsigned int uc = c;
+
+ if (uc <= 0x7f)
+ return 1;
+ if (uc <= 0x7ff)
+ return 2;
+ if (uc <= 0xffff)
+ return 3;
+ if (uc <= 0x10ffff)
+ return 4;
+ /* The rest are illegal. */
+ return 0;
+}
+
+size_t
+MR_utf8_encode(char s_[], int c)
+{
+ unsigned char *s = (unsigned char *)s_;
+ unsigned int uc = c;
+
+ if (uc <= 0x7f) {
+ s[0] = uc;
+ return 1;
+ }
+
+ if (uc <= 0x7ff) {
+ s[0] = 0xC0 | ((uc >> 6) & 0x1F);
+ s[1] = 0x80 | (uc & 0x3F);
+ return 2;
+ }
+
+ if (uc <= 0xffff) {
+ s[0] = 0xE0 | ((uc >> 12) & 0x0F);
+ s[1] = 0x80 | ((uc >> 6) & 0x3F);
+ s[2] = 0x80 | (uc & 0x3F);
+ return 3;
+ }
+
+ if (uc <= 0x10ffff) {
+ s[0] = 0xF0 | ((uc >> 18) & 0x07);
+ s[1] = 0x80 | ((uc >> 12) & 0x3F);
+ s[2] = 0x80 | ((uc >> 6) & 0x3F);
+ s[3] = 0x80 | (uc & 0x3F);
+ return 4;
+ }
+
+ /* Otherwise is illegal. */
+ return 0;
+}
+
+MR_bool
+MR_utf8_verify(const MR_String s)
+{
+ int pos = 0;
+ int c;
+
+ for (;;) {
+ c = MR_utf8_get_next(s, &pos);
+ if (c == 0) {
+ return MR_TRUE;
+ }
+ if (c < 0) {
+ return MR_FALSE;
+ }
+ }
+}
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index b12afbe..6265e5d 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -15,24 +15,21 @@
#include <stdarg.h>
/*
-** Mercury characters are given type `MR_Char', which is a typedef for `char'.
-** But BEWARE: when stored in an MR_Integer, the value must be
-** first cast to `MR_UnsignedChar'.
-** Mercury strings are stored as pointers to '\0'-terminated arrays of MR_Char.
+** Mercury characters (Unicode code points) are given type `MR_Char', which is
+** a typedef for `int'.
+** Mercury strings are stored as pointers to '\0'-terminated arrays of `char'.
+** Strings are UTF-8 encoded.
** Mercury strings must not contain null characters. Unexpected null characters
** are a source of security vulnerabilities.
**
-** We may eventually move to using wchar_t for Mercury characters and strings,
-** so it is important to use these typedefs.
-**
** The actual typedefs are in mercury_types.h to avoid problems with
** circular #includes.
**
-** typedef char MR_Char;
-** typedef unsigned char MR_UnsignedChar;
+** typedef int MR_Char;
+** typedef unsigned int MR_UnsignedChar;
**
-** typedef MR_Char *MR_String;
-** typedef const MR_Char *MR_ConstString;
+** typedef char *MR_String;
+** typedef const char *MR_ConstString;
*/
/*
@@ -202,9 +199,9 @@
MR_CHECK_EXPR_TYPE(s, MR_ConstString); \
len = 0; \
hash = 0; \
- while(((MR_ConstString)(s))[len]) { \
+ while (((const unsigned char *)(s))[len]) { \
hash ^= (hash << 5); \
- hash ^= (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+ hash ^= ((const unsigned char *)(s))[len]; \
len++; \
} \
hash ^= len; \
@@ -217,9 +214,9 @@
MR_CHECK_EXPR_TYPE(s, MR_ConstString); \
len = 0; \
hash = 0; \
- while(((MR_ConstString)(s))[len]) { \
+ while (((const unsigned char *)(s))[len]) { \
hash = hash * 37; \
- hash += (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+ hash += ((const unsigned char *)(s))[len]; \
len++; \
} \
hash ^= len; \
@@ -232,9 +229,9 @@
MR_CHECK_EXPR_TYPE(s, MR_ConstString); \
len = 0; \
hash = 0; \
- while(((MR_ConstString)(s))[len]) { \
+ while (((const unsigned char *)(s))[len]) { \
hash = hash * 49; \
- hash += (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+ hash += ((const unsigned char *)(s))[len]; \
len++; \
} \
hash ^= len; \
@@ -314,4 +311,61 @@ MR_Integer MR_hash_string3(MR_ConstString);
MR_String MR_make_string(MR_Code *proclabel, const char *fmt, ...);
+/*
+** UTF-8 manipulation
+*/
+
+#define MR_utf8_is_single_byte(c) (((unsigned)(c) & 0x80) == 0)
+#define MR_utf8_is_lead_byte(c) (((unsigned)(c) - 0xC0) < 0x3E)
+#define MR_utf8_is_trail_byte(c) (((unsigned)(c) & 0xC0) == 0x80)
+
+/*
+** Advance `*pos' to the beginning of the next code point in `s'.
+** If `*pos' is already at the end of the string then return MR_FALSE
+** without modifying `*pos'.
+*/
+extern MR_bool MR_utf8_next(const MR_String s_, int *pos);
+
+/*
+** Rewind `*pos' to the beginning of the previous code point in `s'.
+** If `*pos' is already at the beginning of the string then return MR_FALSE
+** without modifying `*pos'.
+*/
+extern MR_bool MR_utf8_prev(const MR_String s_, int *pos);
+
+/*
+** Decode and return the code point beginning at `pos' in `s'.
+** Return 0 if at the end of the string (i.e. the NUL terminator).
+** If an illegal code sequence exists at that offset, return -2.
+*/
+extern int MR_utf8_get(const MR_String s, int pos);
+
+/*
+** Decode the code point beginning at `pos' in `s', and advance `*pos'.
+*/
+extern int MR_utf8_get_next(const MR_String s, int *pos);
+
+/*
+** Rewind `*pos' to the beginning of the previous code point in `s'
+** and return that code code.
+** Return -1 if `*pos' is already at the beginning of the string.
+*/
+extern int MR_utf8_prev_get(const MR_String s, int *pos);
+
+/*
+** Return the number of bytes required to encode the code point `c' in UTF-8.
+*/
+extern size_t MR_utf8_width(int c);
+
+/*
+** Encode the code point `c' into the buffer `s'.
+** Return the number of bytes used.
+*/
+extern size_t MR_utf8_encode(char s[], int c);
+
+/*
+** Return MR_TRUE iff `s' contains a valid UTF-8 encoded string.
+*/
+extern MR_bool MR_utf8_verify(const MR_String s);
+
#endif /* not MERCURY_STRING_H */
diff --git a/runtime/mercury_types.h b/runtime/mercury_types.h
index fc51fc8..fd32326 100644
--- a/runtime/mercury_types.h
+++ b/runtime/mercury_types.h
@@ -128,11 +128,11 @@ typedef MR_Code *MR_CodePtr;
** If you modify them, you will need to modify mercury_string.h as well.
*/
-typedef char MR_Char;
-typedef unsigned char MR_UnsignedChar;
+typedef int MR_Char;
+typedef unsigned int MR_UnsignedChar;
-typedef MR_Char *MR_String;
-typedef const MR_Char *MR_ConstString;
+typedef char *MR_String;
+typedef const char *MR_ConstString;
/*
** Definitions for accessing the representation of the Mercury `array' type.
diff --git a/tests/debugger/tailrec1.m b/tests/debugger/tailrec1.m
index a2949d3..13e680f 100644
--- a/tests/debugger/tailrec1.m
+++ b/tests/debugger/tailrec1.m
@@ -55,7 +55,9 @@ tailrec1_read_strings(Stream, !Words, !IO) :-
int i;
i = 0;
- while ((c = mercury_getc((MercuryFilePtr) Stream)) != EOF && c != '\\n') {
+ while ((c = mercury_get_byte((MercuryFilePtr) Stream)) != EOF &&
+ c != '\\n')
+ {
if (i < 100) {
buf[i] = c;
}
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 5df4d3e..1d77007 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -294,6 +294,7 @@ ORDINARY_PROGS= \
user_compare \
user_defined_equality2 \
value_enum \
+ words_separator \
write \
write_reg1 \
write_reg2 \
diff --git a/tests/hard_coded/nonascii.exp b/tests/hard_coded/nonascii.exp
index 88fe4c9..28d7cbc 100644
--- a/tests/hard_coded/nonascii.exp
+++ b/tests/hard_coded/nonascii.exp
@@ -252,6 +252,11 @@
253,
254,
255,
+2047,
+2048,
+65535,
+65536,
+1114111,
10
1
2
@@ -507,4 +512,9 @@
253
254
255
+2047
+2048
+65535
+65536
+1114111
10
diff --git a/tests/hard_coded/nonascii.m b/tests/hard_coded/nonascii.m
index 23fa3eb..c7825fc 100644
--- a/tests/hard_coded/nonascii.m
+++ b/tests/hard_coded/nonascii.m
@@ -18,11 +18,12 @@ main(!IO) :-
(
Result = ok(Stream),
test1(Stream, !IO),
- test2(Stream, 255, !IO)
+ test2(Stream, 260, !IO)
;
Result = error(Error),
io__error_message(Error, Msg),
- io__write_string(Msg, !IO)
+ io__write_string(Msg, !IO),
+ io__nl(!IO)
).
:- pred test1(io__input_stream::in, io::di, io::uo) is det.
@@ -41,7 +42,8 @@ test1(Stream, !IO) :-
;
Result = error(Error),
io__error_message(Error, Msg),
- io__write_string(Msg, !IO)
+ io__write_string(Msg, !IO),
+ io__nl(!IO)
).
:- pred test2(io__input_stream::in, int::in, io::di, io::uo) is det.
@@ -60,7 +62,8 @@ test2(Stream, N, !IO) :-
;
Result = error(Error),
io__error_message(Error, Msg),
- io__write_string(Msg, !IO)
+ io__write_string(Msg, !IO),
+ io__nl(!IO)
),
test2(Stream, N - 1, !IO)
;
diff --git a/tests/hard_coded/nonascii_gen.c b/tests/hard_coded/nonascii_gen.c
index b3a7196..2cf3423 100644
--- a/tests/hard_coded/nonascii_gen.c
+++ b/tests/hard_coded/nonascii_gen.c
@@ -1,24 +1,73 @@
#include <stdio.h>
+size_t
+MR_utf8_encode(char s_[], int c)
+{
+ unsigned char *s = (unsigned char *)s_;
+ unsigned int uc = c;
+
+ if (uc <= 0x7f) {
+ s[0] = uc;
+ return 1;
+ }
+
+ if (uc <= 0x7ff) {
+ s[0] = 0xC0 | ((uc >> 6) & 0x1F);
+ s[1] = 0x80 | (uc & 0x3F);
+ return 2;
+ }
+
+ if (uc <= 0xffff) {
+ s[0] = 0xE0 | ((uc >> 12) & 0x0F);
+ s[1] = 0x80 | ((uc >> 6) & 0x3F);
+ s[2] = 0x80 | (uc & 0x3F);
+ return 3;
+ }
+
+ if (uc <= 0x10ffff) {
+ s[0] = 0xF0 | ((uc >> 18) & 0x07);
+ s[1] = 0x80 | ((uc >> 12) & 0x3F);
+ s[2] = 0x80 | ((uc >> 6) & 0x3F);
+ s[3] = 0x80 | (uc & 0x3F);
+ return 4;
+ }
+
+ /* Otherwise is illegal. */
+ return 0;
+}
+
+void
+put_utf8(int c)
+{
+ char s[4];
+ size_t n;
+ size_t i;
+
+ n = MR_utf8_encode(s, c);
+ for (i = 0; i < n; i++) {
+ putchar(s[i]);
+ }
+}
+
int
main(void)
{
+ int repeat;
int i;
- for (i = 1; i < 256; i++) {
- if (i != '\n') {
- putchar(i);
+ for (repeat = 0; repeat < 2; repeat++) {
+ for (i = 1; i < 256; i++) {
+ if (i != '\n') {
+ put_utf8(i);
+ }
}
- }
-
- putchar('\n');
- for (i = 1; i < 256; i++) {
- if (i != '\n') {
- putchar(i);
- }
+ put_utf8(0x0007ff);
+ put_utf8(0x000800);
+ put_utf8(0x00ffff);
+ put_utf8(0x010000);
+ put_utf8(0x10ffff);
+ put_utf8('\n');
}
-
- putchar('\n');
return 0;
}
diff --git a/tests/hard_coded/special_char.exp b/tests/hard_coded/special_char.exp
index 52dc034..91f2fe8 100644
--- a/tests/hard_coded/special_char.exp
+++ b/tests/hard_coded/special_char.exp
@@ -1,4 +1,4 @@
Hello world
-?SÑ
+?SÃ
diff --git a/tests/hard_coded/unicode_test.exp b/tests/hard_coded/unicode_test.exp
index 5ed1bf3..cccf2b8 100644
--- a/tests/hard_coded/unicode_test.exp
+++ b/tests/hard_coded/unicode_test.exp
@@ -1,11 +1,11 @@
00000011
00000011
-11001110 10010100
-11001110 10100000
-11101111 10111111 10111111
-11110100 10001111 10111111 10111111
-11110010 10101011 10110011 10011110
-01110010 11000011 10101001 01110011 01110101 01101101 11000011 10101001
+1110010100
+1110100000
+1111111111111111
+100001111111111111111
+10101011110011011110
+01110010 11101001 01110011 01110101 01101101 11101001
01100001 01100010 01100011 00110001 00110010 00110011
01011100 01110101 00110000 00110000 00110100 00110001
01011100 01110101 00110000 00110000 00110100 00110001
diff --git a/tests/hard_coded/words_separator.exp b/tests/hard_coded/words_separator.exp
new file mode 100644
index 0000000..a330c01
--- /dev/null
+++ b/tests/hard_coded/words_separator.exp
@@ -0,0 +1,8 @@
+[]
+[]
+[]
+["x"]
+["x"]
+["x", "y"]
+["x", "y"]
+["x", "y"]
diff --git a/tests/hard_coded/words_separator.m b/tests/hard_coded/words_separator.m
new file mode 100644
index 0000000..bcaa715
--- /dev/null
+++ b/tests/hard_coded/words_separator.m
@@ -0,0 +1,43 @@
+%-----------------------------------------------------------------------------%
+
+:- module words_separator.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%-----------------------------------------------------------------------------%
+
+main(!IO) :-
+ S1 = words_separator(comma, ""),
+ S2 = words_separator(comma, ","),
+ S3 = words_separator(comma, ",,"),
+ S4 = words_separator(comma, "x,"),
+ S5 = words_separator(comma, ",x"),
+ S6 = words_separator(comma, "x,y"),
+ S7 = words_separator(comma, "x,,y"),
+ S8 = words_separator(comma, ",x,,y,"),
+ list.foldl(write_nl, [S1, S2, S3, S4, S5, S6, S7, S8], !IO).
+
+:- pred comma(char::in) is semidet.
+
+comma(',').
+
+:- pred write_nl(list(string)::in, io::di, io::uo) is det.
+
+write_nl(L, !IO) :-
+ io.write(L, !IO),
+ io.nl(!IO).
+
+%-----------------------------------------------------------------------------%
+% vim: ft=mercury ts=8 sts=4 sw=4 et
diff --git a/tests/valid/mercury_java_parser_follow_code_bug.m b/tests/valid/mercury_java_parser_follow_code_bug.m
index a9d42a4..92dee13 100644
--- a/tests/valid/mercury_java_parser_follow_code_bug.m
+++ b/tests/valid/mercury_java_parser_follow_code_bug.m
@@ -500,7 +500,7 @@ null_literal -->
% [5]Contents | [6]Prev | [7]Next | [8]Index Java Language Specification
% Second Edition
%
-% [9]Copyright © 2000 Sun Microsystems, Inc. All rights reserved
+% [9]Copyright © 2000 Sun Microsystems, Inc. All rights reserved
% Please send any comments or corrections to the [10]JLS team
% Syntax
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to: mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions: mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the reviews
mailing list