[m-rev.] preliminary review: UTF-8/16 support

Peter Wang novalazy at gmail.com
Wed Feb 16 17:19:57 AEDT 2011


On 2010-12-01, Peter Wang <novalazy at gmail.com> wrote:
> Here is some work in progress on UTF-8 and UTF-16 string support, in
> case someone would like to look at it or make suggestions.
> 
> We would commit to using the Universal Character Set instead of
> trying to be agnostic about string encodings.
> Mercury `char' becomes a Unicode code point, so is re-defined as a
> 32-bit integer (at least).
>    
> String indexing predicates return code points instead of code units.
> e.g. `string.index(String, CodeUnitOffset, CodePoint)' returns the
> code point beginning at CodeUnitOffset.
>    
> When you iterate over a string you should not advance to Index + 1 any
> more, so there is are predicates which return the next/previous offset:
>     string.index_next(String, Offset, CodePoint, NextOffset)
>     string.prev_index(String, NextOffset, CodePoint, Offset)

Newer patch as requested by Julien.

Peter

diff --git a/compiler/c_util.m b/compiler/c_util.m
index 8f9c30a..bfe1cf1 100644
--- a/compiler/c_util.m
+++ b/compiler/c_util.m
@@ -179,11 +179,13 @@
 
 :- implementation.
 
+:- import_module libs.compiler_util.
 :- import_module libs.options.
 
 :- import_module bool.
 :- import_module int.
 :- import_module list.
+:- import_module require.
 :- import_module string.
 
 %-----------------------------------------------------------------------------%
@@ -307,7 +309,27 @@ output_quoted_string(S, !IO) :-
     output_quoted_string_lang(literal_c, S, !IO).
 
 output_quoted_string_lang(Lang, S, !IO) :-
-    do_output_quoted_string(Lang, 0, length(S), S, !IO).
+    (
+        Lang = literal_c,
+        % Avoid a limitation in the MSVC compiler where string literals can be
+        % no longer than 2048 chars. However if you output the string in
+        % chunks, eg "part a" "part b" it will accept a string longer than 2048
+        % chars, go figure!
+        % XXX we could avoid creating new strings
+        string.split_by_codepoint(S, 160, Left, Right),
+        do_output_quoted_string(Lang, Left, 0, !IO),
+        ( Right = "" ->
+            true
+        ;
+            io.write_string("\" \"", !IO),
+            output_quoted_string_lang(Lang, Right, !IO)
+        )
+    ;
+        ( Lang = literal_java
+        ; Lang = literal_csharp
+        ),
+        do_output_quoted_string(Lang, S, 0, !IO)
+    ).
 
 output_quoted_multi_string(Ss, !IO) :-
     output_quoted_multi_string_lang(literal_c, Ss, !IO).
@@ -318,50 +340,13 @@ output_quoted_multi_string_lang(Lang, [S | Ss], !IO) :-
     output_quoted_char_lang(Lang, char.det_from_int(0), !IO),
     output_quoted_multi_string_lang(Lang, Ss, !IO).
 
-:- pred do_output_quoted_string(literal_language::in, int::in, int::in,
-    string::in, io::di, io::uo) is det.
+:- pred do_output_quoted_string(literal_language::in, string::in,
+    int::in, io::di, io::uo) is det.
 
-do_output_quoted_string(Lang, Cur, Len, S, !IO) :-
-    ( Cur < Len ->
-        % Avoid a limitation in the MSVC compiler where string literals
-        % can be no longer than 2048 chars. However if you output the string
-        % in chunks, eg "part a" "part b" it will accept a string longer than
-        % 2048 chars, go figure!
-        (
-            Lang = literal_c,
-            Cur \= 0,
-            Cur mod 512 = 0
-        ->
-            io.write_string("\" \"", !IO)
-        ;
-            true
-        ),
-
-        string.unsafe_index(S, Cur, Char),
+do_output_quoted_string(Lang, S, Cur, !IO) :-
+    ( string.unsafe_index_next(S, Cur, Next, Char) ->
         output_quoted_char_lang(Lang, Char, !IO),
-        
-        % Check for trigraph sequences in string literals. We break the
-        % trigraph by breaking the string into multiple chunks. For example,
-        % "??-" gets converted to "?" "?-".
-        (
-            Lang = literal_c,
-            Char = '?',
-            Cur + 2 < Len
-        ->
-            (
-                string.unsafe_index(S, Cur + 1, '?'),
-                string.unsafe_index(S, Cur + 2, ThirdChar),
-                is_trigraph_char(ThirdChar)
-            ->
-                io.write_string("\" \"", !IO)
-            ;
-                true
-            )
-        ;
-            true
-        ),
-
-        do_output_quoted_string(Lang, Cur + 1, Len, S, !IO)
+        do_output_quoted_string(Lang, S, Next, !IO)
     ;
         true
     ).
@@ -403,25 +388,40 @@ quote_one_char(Lang, Char, RevChars0, RevChars) :-
     ->
         RevChars = [EscapeChar, '\\' | RevChars0]
     ;
-        is_c_source_char(Char)
+        Lang = literal_c,
+        Char = '?'
     ->
-        RevChars = [Char | RevChars0]
+        % Avoid trigraphs by escaping the question marks.
+        RevChars = ['?', '\\' | RevChars0]
     ;
-        Lang = literal_java,
-        char.to_int(Char) >= 0x80
+        is_c_source_char(Char)
     ->
-        % If the compiler is built in a C grade (8-bit strings), we assume that
-        % both the Mercury source file and Java target file use UTF-8 encoding.
-        % Each `Char' will be a UTF-8 code unit in a multi-byte sequence.
-        % If the compiler is built in a Java backend, each `Char' will be a
-        % UTF-16 code unit, possibly of a surrogate pair. In both cases the
-        % code units must be passed through without escaping.
         RevChars = [Char | RevChars0]
     ;
         char.to_int(Char, 0)
     ->
         RevChars = ['0', '\\' | RevChars0]
     ;
+        Int = char.to_int(Char),
+        Int >= 0x80
+    ->
+        (
+            Lang = literal_c,
+            ( char.to_utf8(Char, CodeUnits) ->
+                list.map(octal_escape_any_int, CodeUnits, EscapeCharss),
+                list.condense(EscapeCharss, EscapeChars),
+                reverse_append(EscapeChars, RevChars0, RevChars)
+            ;
+                unexpected($module, $pred, "invalid Unicode code point")
+            )
+        ;
+            Lang = literal_java,
+            RevChars = [Char | RevChars0]
+        ;
+            Lang = literal_csharp,
+            RevChars = [Char | RevChars0]
+        )
+    ;
         (
             Lang = literal_c,
             octal_escape_any_char(Char, EscapeChars)
@@ -430,7 +430,7 @@ quote_one_char(Lang, Char, RevChars0, RevChars) :-
             octal_escape_any_char(Char, EscapeChars)
         ;
             Lang = literal_csharp,
-            hex_escape_any_char(Char, EscapeChars)
+            unicode_escape_any_char(Char, EscapeChars)
         ),
         reverse_append(EscapeChars, RevChars0, RevChars)
     ).
@@ -453,20 +453,6 @@ escape_special_char('\v', 'v'). % not in Java
 escape_special_char('\r', 'r').
 escape_special_char('\f', 'f').
 
-    % Succeed if the given character, prefixed with "??", is a trigraph.
-    %
-:- pred is_trigraph_char(char::in) is semidet.
-
-is_trigraph_char('(').
-is_trigraph_char(')').
-is_trigraph_char('<').
-is_trigraph_char('>').
-is_trigraph_char('=').
-is_trigraph_char('/').
-is_trigraph_char('\'').
-is_trigraph_char('!').
-is_trigraph_char('-').
-
     % This succeeds iff the specified character is allowed as an (unescaped)
     % character in standard-conforming C source code.
     %
@@ -501,19 +487,20 @@ reverse_append([X | Xs], L0, L) :-
     %
 octal_escape_any_char(Char, EscapeCodeChars) :-
     char.to_int(Char, Int),
+    octal_escape_any_int(Int, EscapeCodeChars).
+
+:- pred octal_escape_any_int(int::in, list(char)::out) is det.
+
+octal_escape_any_int(Int, EscapeCodeChars) :-
     string.int_to_base_string(Int, 8, OctalString0),
     string.pad_left(OctalString0, '0', 3, OctalString),
     EscapeCodeChars = ['\\' | string.to_char_list(OctalString)].
 
-:- pred hex_escape_any_char(char::in, list(char)::out) is det.
+:- pred unicode_escape_any_char(char::in, list(char)::out) is det.
 
-    % Convert a character to the corresponding hexadeciaml escape code.
-    % XXX This assumes that the target language compiler's representation
-    % of characters is the same as the Mercury compiler's.
-    %
-hex_escape_any_char(Char, EscapeCodeChars) :-
+unicode_escape_any_char(Char, EscapeCodeChars) :-
     char.to_int(Char, Int),
-    string.format("\\x%04x", [i(Int)], HexString),
+    string.format("\\u%04x", [i(Int)], HexString),
     string.to_char_list(HexString, EscapeCodeChars).
 
 %-----------------------------------------------------------------------------%
diff --git a/compiler/compile_target_code.m b/compiler/compile_target_code.m
index fe4a07b..5552c4f 100644
--- a/compiler/compile_target_code.m
+++ b/compiler/compile_target_code.m
@@ -404,8 +404,8 @@ do_compile_c_file(ErrorStream, PIC, C_File, O_File, Globals, Succeeded, !IO) :-
     string.append_list([
         CC, " ", 
         AllCFlags,
-        " -c ", C_File, " ",
-        NameObjectFile, O_File], Command),
+        " -c ", quote_arg(C_File), " ",
+        NameObjectFile, quote_arg(O_File)], Command),
     get_maybe_filtercc_command(Globals, MaybeFilterCmd),
     invoke_system_command_maybe_filter_output(Globals, ErrorStream,
         cmd_verbose_commands, Command, MaybeFilterCmd, Succeeded, !IO).
@@ -1262,7 +1262,8 @@ module_name_to_file_name_ext(Globals, Ext, MkDir, ModuleName, FileName, !IO) :-
 
 invoke_mkinit(Globals, InitFileStream, Verbosity,
         MkInit, Args, FileNames, MkInitOK, !IO) :-
-    join_quoted_string_list(FileNames, "", "\n", "", TargetFileNames),
+    % mkinit expects unquoted file names.
+    join_string_list(FileNames, "", "\n", "", TargetFileNames),
 
     io.make_temp(TmpFile, !IO),
     io.open_output(TmpFile, OpenResult, !IO),
@@ -1969,7 +1970,7 @@ link_exe_or_shared_lib(Globals, ErrorStream, LinkTargetType, ModuleName,
                     UndefOpt, " ",
                     ThreadOpts, " ",
                     TraceOpts, " ",
-                    " -o ", OutputFileName, " ",
+                    " -o ", quote_arg(OutputFileName), " ",
                     Objects, " ",
                     LinkOptSep, " ",
                     LinkLibraryDirectories, " ",
diff --git a/compiler/elds_to_erlang.m b/compiler/elds_to_erlang.m
index 6beb095..23d3884 100644
--- a/compiler/elds_to_erlang.m
+++ b/compiler/elds_to_erlang.m
@@ -1178,7 +1178,7 @@ write_with_escaping(StringOrAtom, String, !IO) :-
 write_with_escaping_2(StringOrAtom, Char, !IO) :-
     char.to_int(Char, Int),
     (
-        32 =< Int, Int =< 126,
+        32 =< Int, % Int =< 126,
         Char \= ('\\'),
         (
             StringOrAtom = in_string,
diff --git a/compiler/mlds_to_cs.m b/compiler/mlds_to_cs.m
index 9d8133d..686b8b0 100644
--- a/compiler/mlds_to_cs.m
+++ b/compiler/mlds_to_cs.m
@@ -2238,7 +2238,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
         ArrayDims = []
     ;
         MLDS_Type = mlds_native_char_type,
-        String = "char",
+        % C# `char' not large enough for code points so we must use `int'.
+        String = "int",
         ArrayDims = []
     ;
         MLDS_Type = mlds_foreign_type(ForeignType),
@@ -2333,7 +2334,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
 mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
     (
         CtorCat = ctor_cat_builtin(cat_builtin_char),
-        String = "char",
+        % C# `char' not large enough for code points so we must use `int'.
+        String = "int",
         ArrayDims = []
     ;
         CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -3513,9 +3515,10 @@ csharp_builtin_type(Type, "double") :-
     Type = mlds_native_float_type.
 csharp_builtin_type(Type, "double") :-
     Type = mercury_type(builtin_type(builtin_type_float), _, _).
-csharp_builtin_type(Type, "char") :-
+    % C# `char' not large enough for code points so we must use `int'.
+csharp_builtin_type(Type, "int") :-
     Type = mlds_native_char_type.
-csharp_builtin_type(Type, "char") :-
+csharp_builtin_type(Type, "int") :-
     Type = mercury_type(builtin_type(builtin_type_char), _, _).
 csharp_builtin_type(Type, "bool") :-
     Type = mlds_native_bool_type.
@@ -3609,7 +3612,7 @@ output_rval_const(Info, Const, !IO) :-
         output_int_const(N, !IO)
     ;
         Const = mlconst_char(N),
-        io.write_string("((char) ", !IO),
+        io.write_string("( ", !IO),
         output_int_const(N, !IO),
         io.write_string(")", !IO)
     ;
diff --git a/compiler/mlds_to_java.m b/compiler/mlds_to_java.m
index 3004247..cf9db1c 100644
--- a/compiler/mlds_to_java.m
+++ b/compiler/mlds_to_java.m
@@ -3449,7 +3449,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
         ArrayDims = []
     ;
         MLDS_Type = mlds_native_char_type,
-        String = "char",
+        % Java `char' not large enough for code points so we must use `int'.
+        String = "int",
         ArrayDims = []
     ;
         MLDS_Type = mlds_foreign_type(ForeignType),
@@ -3543,7 +3544,8 @@ type_to_string(Info, MLDS_Type, String, ArrayDims) :-
 mercury_type_to_string(Info, Type, CtorCat, String, ArrayDims) :-
     (
         CtorCat = ctor_cat_builtin(cat_builtin_char),
-        String = "char",
+        % Java `char' not large enough for code points so we must use `int'.
+        String = "int",
         ArrayDims = []
     ;
         CtorCat = ctor_cat_builtin(cat_builtin_int),
@@ -4872,9 +4874,10 @@ java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
     Type = mlds_native_float_type.
 java_builtin_type(Type, "double", "java.lang.Double", "doubleValue") :-
     Type = mercury_type(builtin_type(builtin_type_float), _, _).
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+    % Java `char' not large enough for code points so we must use `int'.
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
     Type = mlds_native_char_type.
-java_builtin_type(Type, "char", "java.lang.Character", "charValue") :-
+java_builtin_type(Type, "int", "java.lang.Integer", "intValue") :-
     Type = mercury_type(builtin_type(builtin_type_char), _, _).
 java_builtin_type(Type, "boolean", "java.lang.Boolean", "booleanValue") :-
     Type = mlds_native_bool_type.
@@ -5001,7 +5004,7 @@ output_rval_const(Info, Const, !IO) :-
         output_int_const(N, !IO)
     ;
         Const = mlconst_char(N),
-        io.write_string("((char) ", !IO),
+        io.write_string("(", !IO),
         output_int_const(N, !IO),
         io.write_string(")", !IO)
     ;
diff --git a/doc/reference_manual.texi b/doc/reference_manual.texi
index 5b7879f..f33f0a6 100644
--- a/doc/reference_manual.texi
+++ b/doc/reference_manual.texi
@@ -6742,11 +6742,13 @@ than @samp{sizeof(MR_Word)}.
 @node C# data passing conventions
 @subsection C# data passing conventions
 
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
 and @code{string} are mapped to the Common Language Infrastructure (CLI) types
- at code{System.Int32}, @code{System.Double}, @code{System.Char} and
+ at code{System.Int32}, @code{System.Double}, and
 @code{System.String} respectively, which correspond to the C# types
- at code{int}, @code{double}, @code{char}, and @code{string}.
+ at code{int}, @code{double}, and @code{string}.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the CLI
+type @code{System.Char} because that only holds 16-bit numeric values.
 
 For the Mercury standard library type @samp{bool.bool}, there is a
 corresponding C# type, @code{mr_bool.Bool_0}.  C# code can refer to the
@@ -6858,10 +6860,12 @@ List_1    list.cons(object head, List_1 tail)
 @node Java data passing conventions
 @subsection Java data passing conventions
 
-The Mercury types @code{int}, @code{float}, @code{char},
+The Mercury types @code{int}, @code{float},
 and @code{string} are mapped to the Java types 
- at code{int}, @code{double}, @code{char} and
+ at code{int}, @code{double}, and
 @code{java.lang.String} respectively.
+The Mercury type @code{char} is mapped like @code{int}; @emph{not} to the Java
+type @code{char} because that only holds 16-bit numeric values.
 
 For the Mercury standard library type @samp{bool.bool}, there is a
 corresponding Java type, @code{bool.Bool_0}.  Java code can refer to the
diff --git a/library/char.m b/library/char.m
index 5771ebf..f4ef1df 100644
--- a/library/char.m
+++ b/library/char.m
@@ -24,31 +24,28 @@
 
 :- module char.
 :- interface.
+
 :- import_module enum.
+:- import_module list.
 :- import_module pretty_printer.
 
 %-----------------------------------------------------------------------------%
 
+    % A Unicode code point.
+    %
 :- type char == character.
 
 :- instance enum(character).
 
     % Convert a character to its corresponding numerical code (integer value).
-    % Beware that the mapping from characters to numerical codes is
-    % implementation-dependent; there is no guarantee that the integer values
-    % for characters will fit in 8 bits. Furthermore, the value returned from
-    % char.to_int might be different than the byte(s) used to store the
-    % character in a file. There is also no guarantee that characters created
-    % using `char.to_int(out, in)' can be written to files or to the standard
-    % output or standard error streams. For example, an implementation might
-    % represent characters using Unicode, but store files in an 8-bit national
-    % character set.
     %
     % Note that '\0' is not accepted as a Mercury null character constant.
     % Instead, a null character can be created using `char.det_from_int(0)'.
     % Null characters aren't very useful in Mercury because they aren't
     % allowed in strings.
     %
+    % Codepoints outside the Unicode range are also not accepted.
+    %
 :- func char.to_int(char) = int.
 :- pred char.to_int(char, int).
 :- mode char.to_int(in, out) is det.
@@ -97,8 +94,8 @@
 :- mode char.lower_upper(in, out) is semidet.
 :- mode char.lower_upper(out, in) is semidet.
 
-    % True iff the character is whitespace, i.e. a space, tab,
-    % newline, carriage return, form-feed, or vertical tab.
+    % True iff the character is a whitespace character in the ASCII range,
+    % i.e. a space, tab, newline, carriage return, form-feed, or vertical tab.
     %
 :- pred char.is_whitespace(char::in) is semidet.
 
@@ -175,11 +172,26 @@
     %
 :- func char.char_to_doc(char) = pretty_printer.doc.
 
+    % Encode a Unicode code point in UTF-8.
+    %
+:- pred char.to_utf8(char::in, list(int)::out) is semidet.
+
+    % Encode a Unicode code point in UTF-16 (native endianness).
+    %
+:- pred char.to_utf16(char::in, list(int)::out) is semidet.
+
+    % Succeed if `Char' is a Unicode surrogate code point.
+    % In UTF-16, a code point with a scalar value greater than 0xffff
+    % is encoded with a pair of surrogate code points.
+    %
+:- pred char.is_surrogate(char::in) is semidet.
+
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
 
 :- implementation.
 
+:- import_module int.
 :- import_module require.
 :- import_module term_io.
 
@@ -513,13 +525,8 @@ char.det_from_int(Int) = Char :-
     [will_not_call_mercury, promise_pure, thread_safe,
         does_not_affect_liveness],
 "
-    /*
-    ** If the integer doesn't fit into a char, then the assignment
-    ** `Character = Int' below will truncate it. SUCCESS_INDICATOR will be set
-    ** to true only if the result was not truncated.
-    */
     Character = Int;
-    SUCCESS_INDICATOR = ((MR_UnsignedChar) Character == Int);
+    SUCCESS_INDICATOR = (Character >= 0 && Character <= 0x10ffff);
 ").
 
 :- pragma foreign_proc("C#",
@@ -540,8 +547,8 @@ char.det_from_int(Int) = Char :-
     char.to_int(Character::out, Int::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Character = (char) Int;
-    SUCCESS_INDICATOR = (Character == Int);
+    Character = Int;
+    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
 ").
 
 :- pragma foreign_proc("Java",
@@ -562,8 +569,8 @@ char.det_from_int(Int) = Char :-
     char.to_int(Character::out, Int::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Character = (char) Int;
-    SUCCESS_INDICATOR = ((int) Character == Int);
+    Character = Int;
+    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
 ").
 
 :- pragma foreign_proc("Erlang",
@@ -584,16 +591,8 @@ char.det_from_int(Int) = Char :-
     char.to_int(Character::out, Int::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    case
-        Int >= 0 andalso Int < 256
-    of
-        true ->
-            SUCCESS_INDICATOR = true,
-            Character = Int;
-        false ->
-            SUCCESS_INDICATOR = false,
-            Character = -1
-    end
+    Character = Int,
+    SUCCESS_INDICATOR = (Int >= 0 andalso Int =< 16#10ffff)
 ").
 
     % We used unsigned character codes, so the minimum character code
@@ -607,33 +606,70 @@ char.min_char_value(0).
     [will_not_call_mercury, promise_pure, thread_safe,
         does_not_affect_liveness],
 "
-    Max = UCHAR_MAX;
+    Max = 0x10ffff;
 ").
 :- pragma foreign_proc("C#",
     char.max_char_value(Max::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    // .NET uses 16-bit 'Unicode'. This might be either UCS-2,
-    // where Unicode characters that don't fit in 16 bits are encoded
-    // in two 16 bit characters, or it might be just the 16-bit subset,
-    // i.e. only the Unicode characters that fit in 16 bits.
-    // For our purposes, it doesn't matter.
-    Max = 0xffff;
+    Max = 0x10ffff;
 ").
 :- pragma foreign_proc("Java",
     char.max_char_value(Max::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Max = (int) java.lang.Character.MAX_VALUE;
+    Max = 0x10ffff;
 ").
 :- pragma foreign_proc("Erlang",
     char.max_char_value(Max::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    % XXX actually Erlang doesn't have chars so there should be no maximum
-    Max = 255
+    Max = 16#10ffff
 ").
 
+char.to_utf8(Char, CodeUnits) :-
+    Int = char.to_int(Char),
+    ( Int =< 0x7f ->
+        CodeUnits = [Int]
+    ; Int =< 0x7ff ->
+        A = 0xc0 \/ ((Int >> 6) /\ 0x1f),
+        B = 0x80 \/  (Int       /\ 0x3f),
+        CodeUnits = [A, B]
+    ; Int =< 0xffff ->
+        A = 0xe0 \/ ((Int >> 12) /\ 0x0f),
+        B = 0x80 \/ ((Int >>  6) /\ 0x3f),
+        C = 0x80 \/  (Int        /\ 0x3f),
+        CodeUnits = [A, B, C]
+    ; Int =< 0x10ffff ->
+        A = 0xf0 \/ ((Int >> 18) /\ 0x07),
+        B = 0x80 \/ ((Int >> 12) /\ 0x3f),
+        C = 0x80 \/ ((Int >>  6) /\ 0x3f),
+        D = 0x80 \/  (Int        /\ 0x3f),
+        CodeUnits = [A, B, C, D]
+    ;
+        % Illegal code point.
+        fail
+    ).
+
+char.to_utf16(Char, CodeUnits) :-
+    Int = char.to_int(Char),
+    ( Int =< 0xffff ->
+        CodeUnits = [Int]
+    ; Int =< 0x10ffff ->
+        U = Int - 0x10000,
+        A = 0xd800 \/ (U >> 10),
+        B = 0xdc00 \/ (U /\ 0x3ff),
+        CodeUnits = [A, B]
+    ;
+        % Illegal code point.
+        fail
+    ).
+
+char.is_surrogate(Char) :-
+    Int = char.to_int(Char),
+    Int >= 0xd800,
+    Int =< 0xdfff.
+
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
 % Ralph Becket <rwab1 at cl.cam.ac.uk> 27/04/99
diff --git a/library/io.m b/library/io.m
index 89d0d94..ed568d5 100644
--- a/library/io.m
+++ b/library/io.m
@@ -133,7 +133,7 @@
 % Text input predicates
 %
 
-    % Reads a character from the current input stream.
+    % Reads a character (code point) from the current input stream.
     %
 :- pred io.read_char(io.result(char)::out, io::di, io::uo) is det.
 
@@ -142,7 +142,7 @@
 :- pred io.read_word(io.result(list(char))::out, io::di, io::uo) is det.
 
     % Reads a line from the current input stream, returns the result
-    % as a list of chars.
+    % as a list of characters (code points).
     %
 :- pred io.read_line(io.result(list(char))::out, io::di, io::uo) is det.
 
@@ -152,14 +152,14 @@
     %
 :- pred io.read_line_as_string(io.result(string)::out, io::di, io::uo) is det.
 
-    % Reads all the characters from the current input stream until
-    % eof or error.
+    % Reads all the characters (code points) from the current input stream
+    % until eof or error.
     %
 :- pred io.read_file(io.maybe_partial_res(list(char))::out, io::di, io::uo)
     is det.
 
-    % Reads all the characters from the current input stream until
-    % eof or error. Returns the result as a string rather than
+    % Reads all the characters (code points) from the current input stream
+    % until eof or error. Returns the result as a string rather than
     % as a list of char.
     %
     % Returns an error if the file contains a null character, because
@@ -168,7 +168,7 @@
 :- pred io.read_file_as_string(io.maybe_partial_res(string)::out,
     io::di, io::uo) is det.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl(pred(char, T, T), T, io.maybe_partial_res(T),
@@ -178,7 +178,7 @@
 :- mode io.input_stream_foldl((pred(in, in, out) is cc_multi), in, out,
     di, uo) is cc_multi.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl_io(pred(char, io, io), io.res, io, io).
@@ -187,7 +187,7 @@
 :- mode io.input_stream_foldl_io((pred(in, di, uo) is cc_multi), out, di, uo)
     is cc_multi.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl2_io(pred(char, T, T, io, io),
@@ -197,9 +197,9 @@
 :- mode io.input_stream_foldl2_io((pred(in, in, out, di, uo) is cc_multi),
     in, out, di, uo) is cc_multi.
 
-    % Applies the given closure to each character read from the input stream
-    % in turn, until eof or error, or the closure returns `no' as its
-    % second argument.
+    % Applies the given closure to each character (code point) read from the
+    % input stream in turn, until eof or error, or the closure returns `no' as
+    % its second argument.
     %
 :- pred io.input_stream_foldl2_io_maybe_stop(
     pred(char, bool, T, T, io, io),
@@ -211,7 +211,7 @@
     (pred(in, out, in, out, di, uo) is cc_multi),
     in, out, di, uo) is cc_multi.
 
-    % Un-reads a character from the current input stream.
+    % Un-reads a character (code point) from the current input stream.
     % You can put back as many characters as you like.
     % You can even put back something that you didn't actually read.
     % Note: `io.putback_char' uses the C library function ungetc().
@@ -220,12 +220,12 @@
     %
 :- pred io.putback_char(char::in, io::di, io::uo) is det.
 
-    % Reads a character from specified stream.
+    % Reads a character (code point) from specified stream.
     %
 :- pred io.read_char(io.input_stream::in, io.result(char)::out,
     io::di, io::uo) is det.
 
-    % Reads a character from the specified stream.
+    % Reads a character (code point) from the specified stream.
     % This interface avoids memory allocation when there is no error.
     %
 :- pred io.read_char_unboxed(io.input_stream::in, io.result::out, char::out,
@@ -237,7 +237,7 @@
     io::di, io::uo) is det.
 
     % Reads a line from specified stream, returning the result
-    % as a list of chars.
+    % as a list of characters (code point).
     %
 :- pred io.read_line(io.input_stream::in, io.result(list(char))::out,
     io::di, io::uo) is det.
@@ -249,7 +249,7 @@
 :- pred io.read_line_as_string(io.input_stream::in, io.result(string)::out,
     io::di, io::uo) is det.
 
-    % Reads all the characters from the given input stream until
+    % Reads all the characters (code points) from the given input stream until
     % eof or error.
     %
 :- pred io.read_file(io.input_stream::in,
@@ -264,7 +264,7 @@
 :- pred io.read_file_as_string(io.input_stream::in,
     io.maybe_partial_res(string)::out, io::di, io::uo) is det.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl(io.input_stream, pred(char, T, T),
@@ -274,7 +274,7 @@
 :- mode io.input_stream_foldl(in, in(pred(in, in, out) is cc_multi),
     in, out, di, uo) is cc_multi.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl_io(io.input_stream, pred(char, io, io),
@@ -284,7 +284,7 @@
 :- mode io.input_stream_foldl_io(in, in(pred(in, di, uo) is cc_multi),
     out, di, uo) is cc_multi.
 
-    % Applies the given closure to each character read from
+    % Applies the given closure to each character (code point) read from
     % the input stream in turn, until eof or error.
     %
 :- pred io.input_stream_foldl2_io(io.input_stream,
@@ -297,9 +297,9 @@
     in(pred(in, in, out, di, uo) is cc_multi),
     in, out, di, uo) is cc_multi.
 
-    % Applies the given closure to each character read from the input stream
-    % in turn, until eof or error, or the closure returns `no' as its
-    % second argument.
+    % Applies the given closure to each character (code point) read from the
+    % input stream in turn, until eof or error, or the closure returns `no' as
+    % its second argument.
     %
 :- pred io.input_stream_foldl2_io_maybe_stop(io.input_stream,
     pred(char, bool, T, T, io, io),
@@ -1806,12 +1806,12 @@
     % we don't want the C/Java/etc code to depend on how Mercury stores
     % its discriminated union data types.
 
-    % Reads a character from specified stream, and returns the numerical value
-    % for that character (as from char.to_int). This may involve converting
-    % external character encodings into Mercury's internal character
-    % representation and (for text streams) converting OS line indicators,
-    % e.g. CR-LF for Windows, to '\n' characters. Returns -1 if at EOF,
-    % -2 if an error occurs.
+    % Reads a character (code point) from specified stream, and returns the
+    % numerical value for that character (as from char.to_int). This may
+    % involve converting external character encodings into Mercury's internal
+    % character representation and (for text streams) converting OS line
+    % indicators, e.g. CR-LF for Windows, to '\n' characters.
+    % Returns -1 if at EOF, -2 if an error occurs.
     %
 :- pred io.read_char_code(io.input_stream::in, int::out, io::di, io::uo)
     is det.
@@ -2170,15 +2170,15 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
 #define ML_IO_BYTES_TO_WORDS(n) (((n) + sizeof(MR_Word) - 1) / sizeof(MR_Word))
 #define ML_IO_READ_LINE_START   1024
 
-    MR_Char initial_read_buffer[ML_IO_READ_LINE_START];
-    MR_Char *read_buffer = initial_read_buffer;
+    char initial_read_buffer[ML_IO_READ_LINE_START];
+    char *read_buffer = initial_read_buffer;
     size_t read_buf_size = ML_IO_READ_LINE_START;
     size_t i;
     int char_code = '\\0';
 
     Res = 0;
     for (i = 0; char_code != '\\n'; ) {
-        char_code = mercury_getc(Stream);
+        char_code = mercury_get_byte(Stream);
         if (char_code == EOF) {
             if (i == 0) {
                 Res = -1;
@@ -2189,21 +2189,17 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
             Res = -2;
             break;
         }
-        if (char_code != (MR_UnsignedChar) char_code) {
-            Res = -3;
-            break;
-        }
         read_buffer[i++] = char_code;
         MR_assert(i <= read_buf_size);
         if (i == read_buf_size) {
             /* Grow the read buffer */
             read_buf_size = ML_IO_READ_LINE_GROW(read_buf_size);
             if (read_buffer == initial_read_buffer) {
-                read_buffer = MR_NEW_ARRAY(MR_Char, read_buf_size);
+                read_buffer = MR_NEW_ARRAY(char, read_buf_size);
                 MR_memcpy(read_buffer, initial_read_buffer,
                     ML_IO_READ_LINE_START);
             } else {
-                read_buffer = MR_RESIZE_ARRAY(read_buffer, MR_Char,
+                read_buffer = MR_RESIZE_ARRAY(read_buffer, char,
                     read_buf_size);
             }
         }
@@ -2211,10 +2207,10 @@ io.read_line_as_string(input_stream(Stream), Result, !IO) :-
     if (Res == 0) {
         MR_Word ret_string_word;
         MR_offset_incr_hp_atomic_msg(ret_string_word,
-            0, ML_IO_BYTES_TO_WORDS((i + 1) * sizeof(MR_Char)),
+            0, ML_IO_BYTES_TO_WORDS((i + 1) * sizeof(char)),
             MR_PROC_LABEL, ""string.string/0"");
         RetString = (MR_String) ret_string_word;
-        MR_memcpy(RetString, read_buffer, i * sizeof(MR_Char));
+        MR_memcpy(RetString, read_buffer, i * sizeof(char));
         RetString[i] = '\\0';
     } else {
         /*
@@ -4018,7 +4014,7 @@ have_file_ids :- semidet_fail.
 % Buffer sizes are measured in Chars.
 
 :- type buffer.
-:- pragma foreign_type(c, buffer, "MR_Char *", [can_pass_as_mercury_type]).
+:- pragma foreign_type(c, buffer, "char *", [can_pass_as_mercury_type]).
 
     % XXX It would be better to use a char_array (e.g. defined as char[] in
     % C#) type rather than array(char).  This is because on the Java and IL
@@ -4040,9 +4036,9 @@ have_file_ids :- semidet_fail.
 "{
     MR_Word buf;
     MR_offset_incr_hp_atomic_msg(buf, 0,
-        (Size * sizeof(MR_Char) + sizeof(MR_Word) - 1) / sizeof(MR_Word),
+        (Size * sizeof(char) + sizeof(MR_Word) - 1) / sizeof(MR_Word),
         MR_PROC_LABEL, ""io:buffer/0"");
-    Buffer = (MR_Char *) buf;
+    Buffer = (char *) buf;
 }").
 
 io.alloc_buffer(Size, buffer(Array)) :-
@@ -4058,28 +4054,28 @@ io.alloc_buffer(Size, buffer(Array)) :-
     [will_not_call_mercury, promise_pure, tabled_for_io, thread_safe,
         does_not_affect_liveness],
 "{
-    MR_CHECK_EXPR_TYPE(Buffer0, MR_Char *);
-    MR_CHECK_EXPR_TYPE(Buffer, MR_Char *);
+    MR_CHECK_EXPR_TYPE(Buffer0, char *);
+    MR_CHECK_EXPR_TYPE(Buffer, char *);
 
 #ifdef MR_CONSERVATIVE_GC
-    Buffer = MR_GC_realloc(Buffer0, NewSize * sizeof(MR_Char));
+    Buffer = MR_GC_realloc(Buffer0, NewSize * sizeof(char));
 #else
-    if (Buffer0 + OldSize == (MR_Char *) MR_hp) {
+    if (Buffer0 + OldSize == (char *) MR_hp) {
         MR_Word next;
         MR_offset_incr_hp_atomic_msg(next, 0,
-            (NewSize * sizeof(MR_Char) + sizeof(MR_Word) - 1)
+            (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
                 / sizeof(MR_Word),
             MR_PROC_LABEL, ""io:buffer/0"");
-        assert(Buffer0 + OldSize == (MR_Char *) next);
+        assert(Buffer0 + OldSize == (char *) next);
         Buffer = Buffer0;
     } else {
         /* just have to alloc and copy */
         MR_Word buf;
         MR_offset_incr_hp_atomic_msg(buf, 0,
-            (NewSize * sizeof(MR_Char) + sizeof(MR_Word) - 1)
+            (NewSize * sizeof(char) + sizeof(MR_Word) - 1)
                 / sizeof(MR_Word),
             MR_PROC_LABEL, ""io:buffer/0"");
-        Buffer = (MR_Char *) buf;
+        Buffer = (char *) buf;
         if (OldSize > NewSize) {
             MR_memcpy(Buffer, Buffer0, NewSize);
         } else {
@@ -4128,8 +4124,8 @@ io.buffer_to_string(buffer(Array), Len, String) :-
 "{
     int items_read;
 
-    MR_CHECK_EXPR_TYPE(Buffer0, MR_Char *);
-    MR_CHECK_EXPR_TYPE(Buffer, MR_Char *);
+    MR_CHECK_EXPR_TYPE(Buffer0, char *);
+    MR_CHECK_EXPR_TYPE(Buffer, char *);
 
     items_read = MR_READ(*Stream, Buffer0 + Pos0, Size - Pos0);
 
@@ -5818,6 +5814,18 @@ int             ML_fprintf(MercuryFilePtr mf, const char *format, ...);
             return true;
         }
 
+        private int read_code_unit()
+            throws java.io.IOException
+        {
+            if (!refill_buffer()) {
+                return -1;
+            }
+
+            char c = buf[buf_pos];
+            buf_pos++;
+            return (int) c;
+        }
+
         /*
         ** read_char(): [Java]
         **
@@ -5827,18 +5835,22 @@ int             ML_fprintf(MercuryFilePtr mf, const char *format, ...);
         public int read_char()
             throws java.io.IOException
         {
-            if (!refill_buffer()) {
+            final int c1 = read_code_unit();
+            if (c1 == -1) {
                 return -1;
             }
-
-            char c = buf[buf_pos];
-            buf_pos++;
-
-            if (c == '\\n') {
+            if (c1 == '\\n') {
                 line_number++;
             }
-
-            return c;
+            if (!Character.isHighSurrogate((char) c1)) {
+                return c1;
+            }
+            final int c2 = read_code_unit();
+            if (c2 != -1 && !Character.isLowSurrogate((char) c2)) {
+                // Return replacement character.
+                return 0xfffd;
+            }
+            return Character.toCodePoint((char) c1, (char) c2);
         }
 
         /*
@@ -5911,7 +5923,7 @@ int             ML_fprintf(MercuryFilePtr mf, const char *format, ...);
             }
         }
 
-        public void ungetc(char c) {
+        private void unget_code_unit(char c) {
             /*
             ** If necessary, shift the unread characters in the input buffer
             ** to make room at the front of the buffer.  If the buffer is full
@@ -5933,11 +5945,19 @@ int             ML_fprintf(MercuryFilePtr mf, const char *format, ...);
                 }
             }
 
-            buf_pos--;
-            buf[buf_pos] = c;
+            buf[--buf_pos] = c;
+        }
 
-            if (c == '\\n') {
-                line_number--;
+        public void ungetc(int c) {
+            if (Character.charCount(c) == 1) {
+                unget_code_unit((char) c);
+                if (c == '\\n') {
+                    line_number--;
+                }
+            } else {
+                char[] units = Character.toChars(c);
+                unget_code_unit(units[1]);
+                unget_code_unit(units[0]);
             }
         }
 
@@ -7004,7 +7024,11 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
         // So we have to output each character separately.
 
         for (int i = 0; i < s.Length; i++) {
-            if (s[i] == '\\n') {
+            if (System.Char.IsSurrogate(s[i])) {
+                // XXX unicode: more error checking
+                mf.writer.Write(s.Substring(i, 2));
+                i++;
+            } else if (s[i] == '\\n') {
                 mf.line_number++;
                 mf.writer.WriteLine("""");
             } else {
@@ -7017,10 +7041,14 @@ mercury_print_string(MR_MercuryFileStruct mf, string s)
 
 ").
 
+:- pragma foreign_decl("C", "
+extern int  mercury_get_byte(MercuryFilePtr mf);
+").
+
 :- pragma foreign_code("C", "
 
 int
-mercury_getc(MercuryFilePtr mf)
+mercury_get_byte(MercuryFilePtr mf)
 {
     int c = MR_GETCH(*mf);
     if (c == '\\n') {
@@ -7077,6 +7105,9 @@ mercury_getc(MR_MercuryFileStruct mf)
                 // If not, we still need to treat this as a newline, and thus
                 // increment the line counter.
                 mf.line_number++;
+            } else if (System.Char.IsSurrogate((char) c)) {
+                int c2 = mf.reader.Read();
+                c = System.Char.ConvertToUtf32((char) c, (char) c2);
             }
         } else /* c == NewLine[0] */ {
             switch (io.NewLine.Length) {
@@ -7288,6 +7319,16 @@ mercury_close(MercuryFilePtr mf)
 :- pragma foreign_code("C#", "
 
 public static void
+mercury_write_codepoint(System.IO.TextWriter w, int c)
+{
+    if (c <= 0xffff) {
+        w.Write((char) c);
+    } else {
+        w.Write(System.Char.ConvertFromUtf32(c));
+    }
+}
+
+public static void
 mercury_close(MR_MercuryFileStruct mf)
 {
     if (mf.reader != null) {
@@ -7336,7 +7377,48 @@ io.read_char_code(input_stream(Stream), CharCode, !IO) :-
     [will_not_call_mercury, promise_pure, tabled_for_io,
         does_not_affect_liveness, no_sharing],
 "
-    CharCode = mercury_getc(Stream);
+    char    buf[5];
+    int     nbytes;
+    int     i;
+    int     c;
+    unsigned int    uc;
+
+    c = mercury_get_byte(Stream);
+    uc = c;
+    if (uc <= 0x7f) {
+        CharCode = uc;
+    } else if (c == EOF) {
+        CharCode = -1;
+    } else {
+        if ((uc & 0xE0) == 0xC0) {
+            nbytes = 2;
+        } else if ((uc & 0xF0) == 0xE0) {
+            nbytes = 3;
+        } else if ((uc & 0xF8) == 0xF0) {
+            nbytes = 4;
+        } else {
+            nbytes = 0;
+        }
+        if (nbytes > 0) {
+            buf[0] = uc;
+            for (i = 1; i < nbytes; i++) {
+                uc = mercury_get_byte(Stream);
+                buf[i] = uc;
+            }
+            buf[i] = '\\0';
+            CharCode = MR_utf8_get(buf, 0);
+            if (CharCode < 0) {
+                /* Invalid byte sequence. */
+                errno = EILSEQ;
+                CharCode = -2;
+            }
+        } else {
+            /* Invalid byte sequence. */
+            errno = EILSEQ;
+            CharCode = -2;
+        }
+    }
+
     MR_update_io(IO0, IO);
 ").
 
@@ -7350,7 +7432,7 @@ io.read_byte_val(input_stream(Stream), ByteVal, !IO) :-
     [will_not_call_mercury, promise_pure, tabled_for_io,
         does_not_affect_liveness, no_sharing],
 "
-    ByteVal = mercury_getc(Stream);
+    ByteVal = mercury_get_byte(Stream);
     MR_update_io(IO0, IO);
 ").
 
@@ -7367,9 +7449,20 @@ io.putback_char(input_stream(Stream), Character, !IO) :-
     if (Character == '\\n') {
         MR_line_number(*mf)--;
     }
-    /* XXX should work even if ungetc() fails */
-    if (MR_UNGETCH(*mf, Character) == EOF) {
-        mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+    if (Character <= 0x7f) {
+        if (MR_UNGETCH(*mf, Character) == EOF) {
+            mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+        }
+    } else {
+        char    buf[5];
+        ssize_t len;
+        len = MR_utf8_encode(buf, Character);
+        /* XXX this depends on multiple putback */
+        for (; len > 0; len--) {
+            if (MR_UNGETCH(*mf, buf[len - 1]) == EOF) {
+                mercury_io_error(mf, ""io.putback_char: ungetc failed"");
+            }
+        }
     }
     MR_update_io(IO0, IO);
 ").
@@ -7384,7 +7477,6 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
         does_not_affect_liveness, no_sharing],
 "
     MercuryFilePtr mf = Stream;
-    /* XXX should work even if ungetc() fails */
     if (MR_UNGETCH(*mf, Character) == EOF) {
         mercury_io_error(mf, ""io.putback_byte: ungetc failed"");
     }
@@ -7520,11 +7612,24 @@ io.putback_byte(binary_input_stream(Stream), Character, !IO) :-
         does_not_affect_liveness, no_sharing],
 "
     MercuryFilePtr out = mercury_current_text_output();
-    if (MR_PUTCH(*out, Character) < 0) {
-        mercury_output_error(out);
-    }
-    if (Character == '\\n') {
-        MR_line_number(*out)++;
+    char    buf[5];
+    size_t  len;
+    int     i;
+    if (Character <= 0x7f) {
+        if (MR_PUTCH(*out, Character) < 0) {
+            mercury_output_error(out);
+        }
+        if (Character == '\\n') {
+            MR_line_number(*out)++;
+        }
+    } else {
+        len = MR_utf8_encode(buf, Character);
+        for (i = 0; i < len; i++) {
+            if (MR_PUTCH(*out, buf[i]) < 0) {
+                mercury_output_error(out);
+                break;
+            }
+        }
     }
     MR_update_io(IO0, IO);
 ").
@@ -7626,7 +7731,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
         switch (io.mercury_current_text_output.line_ending) {
         case io.ML_line_ending_kind.ML_raw_binary:
         case io.ML_line_ending_kind.ML_Unix_line_ending:
-            w.Write(Character);
+            mercury_write_codepoint(w, Character);
             break;
         case io.ML_line_ending_kind.ML_OS_line_ending:
             w.WriteLine("""");
@@ -7634,7 +7739,7 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
         }
         io.mercury_current_text_output.line_number++;
     } else {
-        w.Write(Character);
+        mercury_write_codepoint(w, Character);
     }
 ").
 
@@ -7677,8 +7782,11 @@ io.write_bitmap(Bitmap, Start, NumBytes, !IO) :-
     io.write_char(Chr::in, _IO0::di, _IO::uo),
     [may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
 "
-    io.mercury_current_text_output.get().write_or_throw(
-        Character.toString(Chr));
+    io.MR_TextOutputFile stream = io.mercury_current_text_output.get();
+    char[] buf = java.lang.Character.toChars(Chr);
+    for (char c : buf) {
+        stream.put_or_throw(c);
+    }
 ").
 :- pragma foreign_proc("Java",
     io.write_int(Val::in, _IO0::di, _IO::uo),
@@ -7857,11 +7965,24 @@ io.write_char(output_stream(Stream), Character, !IO) :-
     [may_call_mercury, promise_pure, tabled_for_io, thread_safe, terminates,
         does_not_affect_liveness, no_sharing],
 "
-    if (MR_PUTCH(*Stream, Character) < 0) {
-        mercury_output_error(Stream);
-    }
-    if (Character == '\\n') {
-        MR_line_number(*Stream)++;
+    if (Character <= 0x7f) {
+        if (MR_PUTCH(*Stream, Character) < 0) {
+            mercury_output_error(Stream);
+        }
+        if (Character == '\\n') {
+            MR_line_number(*Stream)++;
+        }
+    } else {
+        char    buf[5];
+        size_t  len;
+        int     i;
+        len = MR_utf8_encode(buf, Character);
+        for (i = 0; i < len; i++) {
+            if (MR_PUTCH(*Stream, buf[i]) < 0) {
+                mercury_output_error(Stream);
+                break;
+            }
+        }
     }
     MR_update_io(IO0, IO);
 ").
@@ -8014,7 +8135,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
         switch (stream.line_ending) {
         case io.ML_line_ending_kind.ML_raw_binary:
         case io.ML_line_ending_kind.ML_Unix_line_ending:
-            w.Write(Character);
+            mercury_write_codepoint(w, Character);
             break;
         case io.ML_line_ending_kind.ML_OS_line_ending:
             w.WriteLine("""");
@@ -8022,7 +8143,7 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
         }
         stream.line_number++;
     } else {
-        w.Write(Character);
+        mercury_write_codepoint(w, Character);
     }
 ").
 
@@ -8089,7 +8210,10 @@ io.flush_binary_output(binary_output_stream(Stream), !IO) :-
     io.write_char_2(Stream::in, Character::in, _IO0::di, _IO::uo),
     [may_call_mercury, promise_pure, thread_safe, tabled_for_io, terminates],
 "
-    ((io.MR_TextOutputFile) Stream).put_or_throw(Character);
+    char[] buf = java.lang.Character.toChars(Character);
+    for (char c : buf) {
+        ((io.MR_TextOutputFile) Stream).put_or_throw(c);
+    }
 ").
 
 :- pragma foreign_proc("Java",
diff --git a/library/lexer.m b/library/lexer.m
index 95fb5cc..ac6b828 100644
--- a/library/lexer.m
+++ b/library/lexer.m
@@ -319,8 +319,7 @@ string_get_context(StartPosn, Context, !Posn) :-
 string_read_char(String, Len, Char, Posn0, Posn) :-
     Posn0 = posn(LineNum0, LineOffset0, Offset0),
     Offset0 < Len,
-    string.unsafe_index(String, Offset0, Char),
-    Offset = Offset0 + 1,
+    string.unsafe_index_next(String, Offset0, Offset, Char),
     ( Char = '\n' ->
         LineNum = LineNum0 + 1,
         Posn = posn(LineNum, Offset, Offset)
@@ -332,13 +331,15 @@ string_read_char(String, Len, Char, Posn0, Posn) :-
 
 string_ungetchar(String, Posn0, Posn) :-
     Posn0 = posn(LineNum0, LineOffset0, Offset0),
-    Offset = Offset0 - 1,
-    string.unsafe_index(String, Offset, Char),
-    ( Char = '\n' ->
-        LineNum = LineNum0 - 1,
-        Posn = posn(LineNum, Offset, Offset)
+    ( string.unsafe_prev_index(String, Offset0, Offset, Char) ->
+        ( Char = '\n' ->
+            LineNum = LineNum0 - 1,
+            Posn = posn(LineNum, Offset, Offset)
+        ;
+            Posn = posn(LineNum0, LineOffset0, Offset)
+        )
     ;
-        Posn = posn(LineNum0, LineOffset0, Offset)
+        Posn = Posn0
     ).
 
 :- pred grab_string(string::in, posn::in, string::out,
@@ -1223,13 +1224,14 @@ get_unicode_escape(Stream, NumHexChars, QuoteChar, Chars, HexChars, Token,
         ( if
             rev_char_list_to_string(HexChars, HexString),
             string.base_string_to_int(16, HexString, UnicodeCharCode),
-            convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+            allowed_unicode_char_code(UnicodeCharCode),
+            char.from_int(UnicodeCharCode, UnicodeChar)
         then
             ( if UnicodeCharCode = 0 then
                 Token = null_character_error
             else
                 get_quoted_name(Stream, QuoteChar,
-                    list.reverse(UTFChars) ++ Chars, Token, !IO)
+                    [UnicodeChar | Chars], Token, !IO)
             )
         else
             Token = error("invalid Unicode character code")
@@ -1263,9 +1265,10 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
         ( if
             rev_char_list_to_string(HexChars, HexString),
             string.base_string_to_int(16, HexString, UnicodeCharCode),
-            convert_unicode_char_to_target_chars(UnicodeCharCode, UTFChars)
+            allowed_unicode_char_code(UnicodeCharCode),
+            char.from_int(UnicodeCharCode, UnicodeChar)
         then
-            RevCharsWithUnicode = list.reverse(UTFChars) ++ Chars,
+            RevCharsWithUnicode = [UnicodeChar | Chars],
             ( if UnicodeCharCode = 0 then
                 string_get_context(Posn0, Context, !Posn),
                 Token = null_character_error
@@ -1292,90 +1295,6 @@ string_get_unicode_escape(NumHexChars, String, Len, QuoteChar, Chars,
         )
     ).
 
-:- pred convert_unicode_char_to_target_chars(int::in, list(char)::out)
-    is semidet.
-
-convert_unicode_char_to_target_chars(UnicodeCharCode, Chars) :-
-    BackendEncoding = backend_unicode_encoding,
-    (
-        BackendEncoding = utf8,
-        encode_unicode_char_as_utf8(UnicodeCharCode, Chars)
-    ;
-        BackendEncoding = utf16,
-        encode_unicode_char_as_utf16(UnicodeCharCode, Chars)
-    ).
-
-:- pred encode_unicode_char_as_utf8(int::in, list(char)::out) is semidet.
-
-encode_unicode_char_as_utf8(UnicodeCharCode, UTF8Chars) :-
-    allowed_unicode_char_code(UnicodeCharCode),
-
-    % Refer to table 3-5 of the Unicode 4.0.0 standard (available from
-    % www.unicode.org) for documentation on the bit distribution patterns used
-    % below.
-
-    ( if UnicodeCharCode =< 0x00007F then
-        UTF8Chars = [char.det_from_int(UnicodeCharCode)]
-    else if UnicodeCharCode =< 0x0007FF then
-        Part1 = (0b11111000000 /\ UnicodeCharCode) >> 6,
-        Part2 =  0b00000111111 /\ UnicodeCharCode,
-        char.det_from_int(Part1 \/ 0b11000000, UTF8Char1),
-        char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
-        UTF8Chars = [UTF8Char1, UTF8Char2]
-    else if UnicodeCharCode =< 0x00FFFF then
-        Part1 = (0b1111000000000000 /\ UnicodeCharCode) >> 12,
-        Part2 = (0b0000111111000000 /\ UnicodeCharCode) >> 6,
-        Part3 =  0b0000000000111111 /\ UnicodeCharCode,
-        char.det_from_int(Part1 \/ 0b11100000, UTF8Char1),
-        char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
-        char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
-        UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3]
-    else
-        Part1 = (0b111000000000000000000 /\ UnicodeCharCode) >> 18,
-        Part2 = (0b000111111000000000000 /\ UnicodeCharCode) >> 12,
-        Part3 = (0b000000000111111000000 /\ UnicodeCharCode) >> 6,
-        Part4 =  0b000000000000000111111 /\ UnicodeCharCode,
-        char.det_from_int(Part1 \/ 0b11110000, UTF8Char1),
-        char.det_from_int(Part2 \/ 0b10000000, UTF8Char2),
-        char.det_from_int(Part3 \/ 0b10000000, UTF8Char3),
-        char.det_from_int(Part4 \/ 0b10000000, UTF8Char4),
-        UTF8Chars = [UTF8Char1, UTF8Char2, UTF8Char3, UTF8Char4]
-    ).
-
-:- pred encode_unicode_char_as_utf16(int::in, list(char)::out) is semidet.
-
-    % This predicate should only be called on backends that have
-    % a 16 bit character type.
-    %
-encode_unicode_char_as_utf16(UnicodeCharCode, UTF16Chars) :-
-    allowed_unicode_char_code(UnicodeCharCode),
-
-    % If the code point is less than or equal to 0xFFFF
-    % then the UTF-16 encoding is simply the code point value,
-    % otherwise we construct a surrogate pair.
-
-    ( if UnicodeCharCode =< 0xFFFF then
-        char.det_from_int(UnicodeCharCode, Char),
-        UTF16Chars = [Char]
-    else
-        %
-        % Refer to table 3-4 of the Unicode 4.0.0 standard (available from
-        % www.unicode.org) for documentation on the bit distribution patterns
-        % used below.
-        %
-        UUUUU =      (0b111110000000000000000 /\ UnicodeCharCode) >> 16,
-        XXXXXX =     (0b000001111110000000000 /\ UnicodeCharCode) >> 10,
-        XXXXXXXXXX = (0b000000000001111111111 /\ UnicodeCharCode),
-        WWWWW = UUUUU - 1,
-        Surrogate1Lead = 0b1101100000000000,
-        Surrogate2Lead = 0b1101110000000000,
-        Surrogate1 = Surrogate1Lead \/ (WWWWW << 6) \/ XXXXXX,
-        Surrogate2 = Surrogate2Lead \/ XXXXXXXXXX,
-        char.det_from_int(Surrogate1, Surrogate1Char),
-        char.det_from_int(Surrogate2, Surrogate2Char),
-        UTF16Chars = [Surrogate1Char, Surrogate2Char]
-    ).
-
 :- pred allowed_unicode_char_code(int::in) is semidet.
 
     % Succeeds if the give code point is a legal Unicode code point
@@ -1389,60 +1308,6 @@ allowed_unicode_char_code(Code) :-
         Code >= 0xD800, Code =< 0xDFFF
     ).
 
-:- type unicode_encoding
-    --->    utf8
-    ;       utf16.
-
-:- func backend_unicode_encoding = unicode_encoding.
-
-backend_unicode_encoding = Encoding :-
-    Int = backend_unicode_encoding_int,
-    ( unicode_encoding_int_to_encoding(Int, EncodingPrime) ->
-        Encoding = EncodingPrime
-    ;
-        error("backend_unicode_encoding: unexpected Unicode encoding code")
-    ).
-
-:- pred unicode_encoding_int_to_encoding(int::in, unicode_encoding::out)
-    is semidet.
-
-unicode_encoding_int_to_encoding(0, utf8).
-unicode_encoding_int_to_encoding(1, utf16).
-
-:- func backend_unicode_encoding_int = int.
-
-:- pragma inline(backend_unicode_encoding_int/0).
-
-:- pragma foreign_proc("C",
-    backend_unicode_encoding_int = (EncodingInt::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness],
-"
-    EncodingInt = 0;
-").
-
-:- pragma foreign_proc("Java",
-    backend_unicode_encoding_int = (EncodingInt::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
-    EncodingInt = 1;
-").
-
-:- pragma foreign_proc("C#",
-    backend_unicode_encoding_int = (EncodingInt::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
-"
-    EncodingInt = 1;
-").
-
-:- pragma foreign_proc("Erlang",
-    backend_unicode_encoding_int = (EncodingInt::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness],
-"
-    EncodingInt = 0
-").
-
 :- pred get_hex_escape(io.input_stream::in, char::in, list(char)::in,
     list(char)::in, token::out, io::di, io::uo) is det.
 
diff --git a/library/string.m b/library/string.m
index 8ab5454..0534ee4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -20,18 +20,24 @@
 %
 % The representation of strings is implementation dependent and subject to
 % change. In the current implementation, when Mercury is compiled to C, strings
-% are represented as in C, using a null character as the string terminator.
+% are UTF-8 encoded, using a null character as the string terminator.
+% A single code point requires one to four bytes (code units) to encode.
+%
 % When Mercury is compiled to Java, strings are represented as Java `String's.
-% When Mercury is compiled to .NET IL code, strings are represented as .NET
-% `System.String's.
+% When Mercury is compiled to C# code, strings are represented as
+% `System.String's.  In both cases, strings are UTF-16 encoded.  A single code
+% point requires one or two 16-bit integers (code units) to encode.
+%
+% Whe Mercury is compiled to Erlang, strings are represented as Erlang
+% binaries using UTF-8 encoding.
+% XXX However, this is incomplete.
 %
 % The builtin comparison operation on strings is also implementation dependent.
 % In the current implementation, when Mercury is compiled to C, string
 % comparison is implemented using C's strcmp() function.  When Mercury
 % is compiled to Java, string comparison is implemented using Java's
-% String.compareTo() method.  When Mercury is compiled to .NET IL code
-% string comparison is implemented using C#'s System.String.CompareOrdinal()
-% method.
+% String.compareTo() method.  When Mercury is compiled to C#, string comparison
+% is implemented using C#'s System.String.CompareOrdinal() method.
 %
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
@@ -69,14 +75,43 @@
 :- type text_file
     --->    text_file(string).
 
-    % Determine the length of a string.
+    % Determine the length of a string, in code units.
     % An empty string has length zero.
     %
+    % NOTE: code points (characters) are encoded using one or more code units,
+    % i.e. bytes for UTF-8; 16-bit integers for UTF-16.
+    %
 :- func string.length(string::in) = (int::uo) is det.
 :- pred string.length(string, int).
 :- mode string.length(in, uo) is det.
 :- mode string.length(ui, uo) is det.
 
+    % Synonyms for string.length.
+    %
+:- func string.count_code_units(string) = int.
+:- pred string.count_code_units(string::in, int::out) is det.
+
+    % Determine the number of code points in a string.
+    %
+:- func string.count_codepoints(string) = int.
+:- pred string.count_codepoints(string::in, int::out) is det.
+
+    % string.codepoint_offset(String, CodePointCount, CodePointOffset):
+    % Equivalent to `string.codepoint_offset(String, 0, CodePointCount,
+    % CodePointOffset)'.
+    %
+:- pred string.codepoint_offset(string::in, int::in, int::out) is semidet.
+
+    % string.codepoint_offset(String, StartOffset, CodePointCount,
+    %   CodePointOffset):
+    %
+    % Return the offset into `String' where, starting from `StartOffset',
+    % `CodePointCount' code points are skipped.  Fails if either `StartOffset'
+    % or `CodePointOffset' are out of range.
+    %
+:- pred string.codepoint_offset(string::in, int::in, int::in, int::out)
+    is semidet.
+
     % Append two strings together.
     %
 :- func string.append(string::in, string::in) = (string::uo) is det.
@@ -170,7 +205,7 @@
 :- mode string.string_ops_noncanon(in, in, in, out) is cc_multi.
 
     % string.char_to_string(Char, String).
-    % Converts a character (single-character atom) to a string or vice versa.
+    % Converts a character (code point) to a string or vice versa.
     %
 :- func string.char_to_string(char::in) = (string::uo) is det.
 :- pred string.char_to_string(char, string).
@@ -238,7 +273,7 @@
 :- func string.from_c_pointer(c_pointer::in) = (string::uo) is det.
 
     % string.first_char(String, Char, Rest) is true iff Char is the first
-    % character of String, and Rest is the remainder.
+    % character (code point) of String, and Rest is the remainder.
     %
     % WARNING: string.first_char makes a copy of Rest because the garbage
     % collector doesn't handle references into the middle of an object,
@@ -298,16 +333,18 @@
 :- func string.uncapitalize_first(string) = string.
 :- pred string.uncapitalize_first(string::in, string::out) is det.
 
-    % Convert the string to a list of characters.
-    % Throws an exception if the list of characters contains a null character.
+    % Convert the string to a list of characters (code points).
+    % Throws an exception if the list of characters contains a null character
+    % or surrogate code point.
     %
 :- func string.to_char_list(string) = list(char).
 :- pred string.to_char_list(string, list(char)).
 :- mode string.to_char_list(in, out) is det.
 :- mode string.to_char_list(uo, in) is det.
 
-    % Convert a list of characters to a string.
-    % Throws an exception if the list of characters contains a null character.
+    % Convert a list of characters (code points) to a string.
+    % Throws an exception if the list of characters contains a null character
+    % or surrogate code point.
     %
 :- func string.from_char_list(list(char)::in) = (string::uo) is det.
 :- pred string.from_char_list(list(char), string).
@@ -315,23 +352,34 @@
 :- mode string.from_char_list(out, in) is det.
 
     % As above, but fail instead of throwing an exception if the
-    % list contains a null character.
+    % list contains a null character or surrogate code point.
     %
 :- pred string.semidet_from_char_list(list(char)::in, string::uo) is semidet.
 
     % Same as string.from_char_list, except that it reverses the order
     % of the characters.
-    % Throws an exception if the list of characters contains a null character.
+    % Throws an exception if the list of characters contains a null character
+    % or surrogate code point.
     %
 :- func string.from_rev_char_list(list(char)::in) = (string::uo) is det.
 :- pred string.from_rev_char_list(list(char)::in, string::uo) is det.
 
     % As above, but fail instead of throwing an exception if the
-    % list contains a null character.
+    % list contains a null character or surrogate code point.
     %
 :- pred string.semidet_from_rev_char_list(list(char)::in, string::uo)
     is semidet.
 
+    % Convert a string into a list of code units.
+    %
+:- pred string.to_code_unit_list(string::in, list(int)::out) is det.
+
+    % Convert a list of code units to a string.
+    % Fails if the list does not contain a valid encoding of a string,
+    % in the encoding expected by the current process.
+    %
+:- pred string.from_code_unit_list(list(int)::in, string::uo) is semidet.
+
     % Converts a signed base 10 string to an int; throws an exception
     % if the string argument does not match the regexp [+-]?[0-9]+
     % or the number is not in the range [int.min_int+1, int.max_int].
@@ -371,15 +419,17 @@
     %
 :- pred string.to_float(string::in, float::out) is semidet.
 
-    % True if string contains only alphabetic characters (letters).
+    % True if string contains only alphabetic characters [A-Za-z].
     %
 :- pred string.is_all_alpha(string::in) is semidet.
 
-    % True if string contains only alphabetic characters and underscores.
+    % True if string contains only alphabetic characters [A-Za-z] and
+    % underscores.
     %
 :- pred string.is_all_alpha_or_underscore(string::in) is semidet.
 
-    % True if string contains only letters, digits, and underscores.
+    % True if string contains only alphabetic characters [A-Za-z], digits,
+    % and underscores.
     %
 :- pred string.is_all_alnum_or_underscore(string::in) is semidet.
 
@@ -389,7 +439,7 @@
 
     % string.all_match(TestPred, String):
     %
-    % True if TestPred is true when applied to each character in
+    % True if TestPred is true when applied to each character (code point) in
     % String or if String is the empty string.
     %
 :- pred string.all_match(pred(char)::in(pred(in) is semidet), string::in)
@@ -398,6 +448,8 @@
     % string.pad_left(String0, PadChar, Width, String):
     % Insert `PadChar's at the left of `String0' until it is at least as long
     % as `Width', giving `String'.
+    % Width is measured as the number of code points, which is too simplistic
+    % if the string may contain zero or double width code points.
     %
 :- func string.pad_left(string, char, int) = string.
 :- pred string.pad_left(string::in, char::in, int::in, string::out) is det.
@@ -405,33 +457,39 @@
     % string.pad_right(String0, PadChar, Width, String):
     % Insert `PadChar's at the right of `String0' until it is at least as long
     % as `Width', giving `String'.
+    % Width is measured as the number of code points, which is too simplistic
+    % if the string may contain zero or double width code points.
     %
 :- func string.pad_right(string, char, int) = string.
 :- pred string.pad_right(string::in, char::in, int::in, string::out) is det.
 
     % string.duplicate_char(Char, Count, String):
     % Construct a string consisting of `Count' occurrences of `Char'
-    % in sequence.
+    % code points in sequence.
     %
 :- func string.duplicate_char(char::in, int::in) = (string::uo) is det.
 :- pred string.duplicate_char(char::in, int::in, string::uo) is det.
 
     % string.contains_char(String, Char):
-    % Succeed if `Char' occurs in `String'.
+    % Succeed if the code point `Char' occurs in `String'.
     %
 :- pred string.contains_char(string::in, char::in) is semidet.
 
     % string.index(String, Index, Char):
-    % `Char' is the (`Index' + 1)-th character of `String'.
-    % Fails if `Index' is out of range (negative, or greater than or equal to
-    % the length of `String').
+    % `Char' is the character (code point) in `String', beginning at the
+    % code unit `Index'.  Fails if `Index' is out of range (negative, or
+    % greater than or equal to the length of `String').
+    %
+    % Calls error/1 if an illegal sequence is detected.
     %
 :- pred string.index(string::in, int::in, char::uo) is semidet.
 
     % string.index_det(String, Index, Char):
-    % `Char' is the (`Index' + 1)-th character of `String'.
+    % `Char' is the character (code point) in `String', beginning at the
+    % code unit `Index'.
     % Calls error/1 if `Index' is out of range (negative, or greater than
-    % or equal to the length of `String').
+    % or equal to the length of `String'), or if an illegal sequence is
+    % detected.
     %
 :- func string.index_det(string, int) = char.
 :- pred string.index_det(string::in, int::in, char::uo) is det.
@@ -442,7 +500,8 @@
 :- func string ^ elem(int) = char.
 
     % string.unsafe_index(String, Index, Char):
-    % `Char' is the (`Index' + 1)-th character of `String'.
+    % `Char' is the character (code point) in `String', beginning at the
+    % code unit `Index'.
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than or equal to the length of `String').
     % This version is constant time, whereas string.index_det
@@ -456,56 +515,92 @@
     %
 :- func string ^ unsafe_elem(int) = char.
 
+    % string.index_next(String, Index, NextIndex, Char):
+    % Like `string.index'/3 but also returns the position of the code unit
+    % that follows the code point beginning at `Index',
+    % i.e. NextIndex = Index + num_bytes_to_encode(Char).
+    %
+:- pred string.index_next(string::in, int::in, int::out, char::uo) is semidet.
+
+    % string.unsafe_index_next(String, Index, NextIndex, Char):
+    % `Char' is the character (code point) in `String', beginning at the
+    % code unit `Index'. `NextIndex' is the offset following the encoding
+    % of `Char'. Fails if `Index' is equal to the length of `String'.
+    % WARNING: behavior is UNDEFINED if `Index' is out of range
+    % (negative, or greater than the length of `String').
+    %
+:- pred string.unsafe_index_next(string::in, int::in, int::out, char::uo)
+    is semidet.
+
+    % string.unsafe_prev_index(String, Index, CharIndex, Char):
+    % `Char' is the character (code point) in `String' immediately _before_
+    % the code unit `Index'. `CharIndex' is the offset of the beginning of
+    % `Char'. Fails if `Index' is zero.
+    % WARNING: behavior is UNDEFINED if `Index' is out of range
+    % (negative, or greater than or equal to the length of `String').
+    %
+:- pred string.unsafe_prev_index(string::in, int::in, int::out, char::uo)
+    is semidet.
+
+    % string.unsafe_index_code_unit(String, Index, CodeList):
+    % `Code' unit is the code unit in `String' at the offset `Index'.
+    % WARNING: behavior is UNDEFINED if `Index' is out of range
+    % (negative, or greater than or equal to the length of `String').
+    %
+:- pred string.unsafe_index_code_unit(string::in, int::in, int::out) is det.
+
     % string.chomp(String):
     % `String' minus any single trailing newline character.
     %
 :- func string.chomp(string) = string.
 
     % string.lstrip(String):
-    % `String' minus any initial whitespace characters.
+    % `String' minus any initial whitespace characters in the ASCII range.
     %
 :- func string.lstrip(string) = string.
 
     % string.rstrip(String):
-    % `String' minus any trailing whitespace characters.
+    % `String' minus any trailing whitespace characters in the ASCII range.
     %
 :- func string.rstrip(string) = string.
 
     % string.strip(String):
-    % `String' minus any initial and trailing whitespace characters.
+    % `String' minus any initial and trailing whitespace characters in the
+    % ASCII range.
     %
 :- func string.strip(string) = string.
 
     % string.lstrip_pred(Pred, String):
-    % `String' minus the maximal prefix consisting entirely of chars
-    % satisfying `Pred'.
+    % `String' minus the maximal prefix consisting entirely of characters
+    % (code points) satisfying `Pred'.
     %
 :- func string.lstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
     = (string::out) is det.
 
     % string.rstrip_pred(Pred, String):
-    % `String' minus the maximal suffix consisting entirely of chars
-    % satisfying `Pred'.
+    % `String' minus the maximal suffix consisting entirely of characters
+    % (code points) satisfying `Pred'.
     %
 :- func string.rstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
     = (string::out) is det.
 
     % string.prefix_length(Pred, String):
-    % The length of the maximal prefix of `String' consisting entirely of
-    % chars satisfying Pred.
+    % The length (in code units) of the maximal prefix of `String' consisting
+    % entirely of characters (code points) satisfying Pred.
     %
 :- func string.prefix_length(pred(char)::in(pred(in) is semidet), string::in)
     = (int::out) is det.
 
     % string.suffix_length(Pred, String):
-    % The length of the maximal suffix of `String' consisting entirely of chars
-    % satisfying Pred.
+    % The length (in code units) of the maximal suffix of `String' consisting
+    % entirely of characters (code points) satisfying Pred.
     %
 :- func suffix_length(pred(char)::in(pred(in) is semidet), string::in)
     = (int::out) is det.
 
     % string.set_char(Char, Index, String0, String):
-    % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+    % `String' is `String0', with the code point beginning at code unit
+    % `Index' removed and replaced by `Char'.
     % Fails if `Index' is out of range (negative, or greater than or equal to
     % the length of `String0').
     %
@@ -516,7 +611,8 @@
 %:- mode string.set_char(in, in, di, uo) is semidet.
 
     % string.set_char_det(Char, Index, String0, String):
-    % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+    % `String' is `String0', with the code point beginning at code unit
+    % `Index' removed and replaced by `Char'.
     % Calls error/1 if `Index' is out of range (negative, or greater than
     % or equal to the length of `String0').
     %
@@ -528,7 +624,8 @@
 %:- mode string.set_char_det(in, in, di, uo) is det.
 
     % string.unsafe_set_char(Char, Index, String0, String):
-    % `String' is `String0' with the (`Index' + 1)-th character set to `Char'.
+    % `String' is `String0', with the code point beginning at code unit
+    % `Index' removed and replaced by `Char'.
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than or equal to the length of `String0').
     % This version is constant time, whereas string.set_char_det
@@ -547,8 +644,8 @@
 
     % string.foldl(Closure, String, !Acc):
     % `Closure' is an accumulator predicate which is to be called for each
-    % character of the string `String' in turn. The initial value of the
-    % accumulator is `!.Acc' and the final value is `!:Acc'.
+    % character (code point) of the string `String' in turn. The initial
+    % value of the accumulator is `!.Acc' and the final value is `!:Acc'.
     % (string.foldl is equivalent to
     %   string.to_char_list(String, Chars),
     %   list.foldl(Closure, Chars, !Acc)
@@ -583,6 +680,8 @@
     % is equivalent to string.foldl(Closure, SubString, !Acc)
     % where SubString = string.substring(String, Start, Count).
     %
+    % `Start' and `Count' are in terms of code units.
+    %
 :- func string.foldl_substring(func(char, A) = A, string, int, int, A) = A.
 :- pred string.foldl_substring(pred(char, A, A), string, int, int, A, A).
 :- mode string.foldl_substring(pred(in, in, out) is det, in, in, in,
@@ -599,6 +698,8 @@
     % string.foldl_substring2(Closure, String, Start, Count, !Acc1, !Acc2)
     % A variant of string.foldl_substring with two accumulators.
     %
+    % `Start' and `Count' are in terms of code units.
+    %
 :- pred string.foldl2_substring(pred(char, A, A, B, B),
     string, int, int, A, A, B, B).
 :- mode string.foldl2_substring(pred(in, di, uo, di, uo) is det,
@@ -629,6 +730,8 @@
     % is equivalent to string.foldr(Closure, SubString, !Acc)
     % where SubString = string.substring(String, Start, Count).
     %
+    % `Start' and `Count' are in terms of code units.
+    %
 :- func string.foldr_substring(func(char, T) = T, string, int, int, T) = T.
 :- pred string.foldr_substring(pred(char, T, T), string, int, int, T, T).
 :- mode string.foldr_substring(pred(in, in, out) is det, in, in, in,
@@ -644,7 +747,8 @@
 
     % string.words_separator(SepP, String) returns the list of non-empty
     % substrings of String (in first to last order) that are delimited
-    % by non-empty sequences of chars matched by SepP. For example,
+    % by non-empty sequences of characters (code points) matched by SepP.
+    % For example,
     %
     % string.words_separator(char.is_whitespace, " the cat  sat on the  mat") =
     %   ["the", "cat", "sat", "on", "the", "mat"]
@@ -661,7 +765,7 @@
 
     % string.split_at_separator(SepP, String) returns the list of
     % substrings of String (in first to last order) that are delimited
-    % by chars matched by SepP. For example,
+    % by characters (code points) matched by SepP. For example,
     %
     % string.split_at_separator(char.is_whitespace, " a cat  sat on the  mat")
     %   = ["", "a", "cat", "", "sat", "on", "the", "", "mat"]
@@ -686,33 +790,59 @@
     %
 :- func string.split_at_string(string, string) = list(string).
 
-    % string.split(String, Count, LeftSubstring, RightSubstring):
-    % `LeftSubstring' is the left-most `Count' characters of `String',
-    % and `RightSubstring' is the remainder of `String'.
+    % string.split(String, Index, LeftSubstring, RightSubstring):
+    % Split a string into two substrings, at the code unit `Index'.
     % (If `Count' is out of the range [0, length of `String'], it is treated
     % as if it were the nearest end-point of that range.)
     %
 :- pred string.split(string::in, int::in, string::uo, string::uo) is det.
 
+    % string.split_by_codepoint(String, Count, LeftSubstring, RightSubstring):
+    % `LeftSubstring' is the left-most `Count' characters (code points) of
+    % `String', and `RightSubstring' is the remainder of `String'.
+    % (If `Count' is out of the range [0, length of `String'], it is treated
+    % as if it were the nearest end-point of that range.)
+    %
+:- pred string.split_by_codepoint(string::in, int::in, string::uo, string::uo)
+    is det.
+
     % string.left(String, Count, LeftSubstring):
-    % `LeftSubstring' is the left-most `Count' characters of `String'.
+    % `LeftSubstring' is the left-most `Count' code _units_ of `String'.
     % (If `Count' is out of the range [0, length of `String'], it is treated
     % as if it were the nearest end-point of that range.)
     %
 :- func string.left(string::in, int::in) = (string::uo) is det.
 :- pred string.left(string::in, int::in, string::uo) is det.
 
+    % string.left_by_codepoint(String, Count, LeftSubstring):
+    % `LeftSubstring' is the left-most `Count' characters (code points) of
+    % `String'.
+    % (If `Count' is out of the range [0, length of `String'], it is treated
+    % as if it were the nearest end-point of that range.)
+    %
+:- func string.left_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.left_by_codepoint(string::in, int::in, string::uo) is det.
+
     % string.right(String, Count, RightSubstring):
-    % `RightSubstring' is the right-most `Count' characters of `String'.
+    % `RightSubstring' is the right-most `Count' code _units_ of `String'.
     % (If `Count' is out of the range [0, length of `String'], it is treated
     % as if it were the nearest end-point of that range.)
     %
 :- func string.right(string::in, int::in) = (string::uo) is det.
 :- pred string.right(string::in, int::in, string::uo) is det.
 
+    % string.right_by_codepoint(String, Count, RightSubstring):
+    % `RightSubstring' is the right-most `Count' characters (code points) of
+    % `String'.
+    % (If `Count' is out of the range [0, length of `String'], it is treated
+    % as if it were the nearest end-point of that range.)
+    %
+:- func string.right_by_codepoint(string::in, int::in) = (string::uo) is det.
+:- pred string.right_by_codepoint(string::in, int::in, string::uo) is det.
+
     % string.substring(String, Start, Count, Substring):
-    % `Substring' is first the `Count' characters in what would remain
-    % of `String' after the first `Start' characters were removed.
+    % `Substring' is first the `Count' code _units_ in what would remain
+    % of `String' after the first `Start' code _units_ were removed.
     % (If `Start' is out of the range [0, length of `String'], it is treated
     % as if it were the nearest end-point of that range.
     % If `Count' is out of the range [0, length of `String' - `Start'],
@@ -721,15 +851,28 @@
 :- func string.substring(string::in, int::in, int::in) = (string::uo) is det.
 :- pred string.substring(string::in, int::in, int::in, string::uo) is det.
 
+    % string.substring_by_codepoint(String, Start, Count, Substring):
+    % `Substring' is first the `Count' code points in what would remain
+    % of `String' after the first `Start' code points were removed.
+    % (If `Start' is out of the range [0, length of `String'], it is treated
+    % as if it were the nearest end-point of that range.
+    % If `Count' is out of the range [0, length of `String' - `Start'],
+    % it is treated as if it were the nearest end-point of that range.)
+    %
+:- func string.substring_by_codepoint(string::in, int::in, int::in)
+    = (string::uo) is det.
+:- pred string.substring_by_codepoint(string::in, int::in, int::in, string::uo)
+    is det.
+
     % string.unsafe_substring(String, Start, Count, Substring):
-    % `Substring' is first the `Count' characters in what would remain
-    % of `String' after the first `Start' characters were removed.
+    % `Substring' is first the `Count' code _units_ in what would remain
+    % of `String' after the first `Start' code _units_ were removed.
     % WARNING: if `Start' is out of the range [0, length of `String'],
     % or if `Count' is out of the range [0, length of `String' - `Start'],
     % then the behaviour is UNDEFINED. Use with care!
     % This version takes time proportional to the length of the substring,
     % whereas string.substring may take time proportional to the length
-    %% of the whole string.
+    % of the whole string.
     %
 :- func string.unsafe_substring(string::in, int::in, int::in) = (string::uo)
     is det.
@@ -757,16 +900,16 @@
 :- func string.hash3(string) = int.
 
     % string.sub_string_search(String, SubString, Index).
-    % `Index' is the position in `String' where the first occurrence of
-    % `SubString' begins. Indices start at zero, so if `SubString' is a prefix
-    % of `String', this will return Index = 0.
+    % `Index' is the code unit position in `String' where the first
+    % occurrence of `SubString' begins. Indices start at zero, so if
+    % `SubString' is a prefix of `String', this will return Index = 0.
     %
 :- pred string.sub_string_search(string::in, string::in, int::out) is semidet.
 
     % string.sub_string_search_start(String, SubString, BeginAt, Index).
-    % `Index' is the position in `String' where the first occurrence of
-    % `SubString' occurs such that 'Index' is greater than or equal to
-    % `BeginAt'.  Indices start at zero,
+    % `Index' is the code unit position in `String' where the first
+    % occurrence of `SubString' occurs such that 'Index' is greater than or
+    % equal to `BeginAt'.  Indices start at zero.
     %
 :- pred string.sub_string_search_start(string::in, string::in, int::in,
     int::out)
@@ -848,6 +991,12 @@
     %  bb * 22
     % ccc * 333
     %
+    % NOTE: the result is only aligned in termed of numbers of code points.
+    % The output may not be properly aligned if any code points occupy more or
+    % less than one column when printed.
+    %
+    % (In the future, we could try to use wcswidth.)
+    %
 :- func string.format_table(list(justified_column), string) = string.
 
     % format_table_max(Columns, Separator) does the same job as format_table,
@@ -861,11 +1010,14 @@
     ;       right(list(string)).
 
     % word_wrap(Str, N) = Wrapped.
-    % Wrapped is Str with newlines inserted between words so that at most
-    % N characters appear on a line and each line contains as many whole words
-    % as possible. If any one word exceeds N characters in length then it will
-    % be broken over two (or more) lines. Sequences of whitespace characters
-    % are replaced by a single space.
+    % Wrapped is Str with newlines inserted between words (separated by ASCII
+    % space characters) so that at most N code points appear on a line and each
+    % line contains as many whole words as possible. If any one word exceeds N
+    % code point in length then it will be broken over two (or more) lines.
+    % Sequences of whitespace characters are replaced by a single space.
+    %
+    % NOTE: this function does not care what a code point is, nor how wide are
+    % the glyphs for each code point.
     %
 :- func string.word_wrap(string, int) = string.
 
@@ -873,7 +1025,7 @@
     % word_wrap_separator/3 is like word_wrap/2, except that words that
     % need to be broken up over multiple lines have WordSeparator inserted
     % between each piece. If the length of WordSeparator is greater than
-    % or equal to N, then no separator is used.
+    % or equal to N code points, then no separator is used.
     %
 :- func string.word_wrap_separator(string, int, string) = string.
 
@@ -1008,12 +1160,14 @@ string.foldl2(Closure, String, !Acc1, !Acc2) :-
 string.foldl_substring(Closure, String, Start0, Count0, !Acc) :-
     Start = max(0, Start0),
     Count = min(Count0, length(String) - Start),
-    string.foldl_substring_2(Closure, String, Start, Count, !Acc).
+    End = Start + Count,
+    string.foldl_substring_2(Closure, String, Start, End, !Acc).
 
 string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
     Start = max(0, Start0),
     Count = min(Count0, length(String) - Start),
-    string.foldl2_substring_2(Closure, String, Start, Count, !Acc1, !Acc2).
+    End = Start + Count,
+    string.foldl2_substring_2(Closure, String, Start, End, !Acc1, !Acc2).
 
 :- pred string.foldl_substring_2(pred(char, A, A), string, int, int, A, A).
 :- mode string.foldl_substring_2(pred(in, di, uo) is det, in, in, in,
@@ -1027,10 +1181,14 @@ string.foldl2_substring(Closure, String, Start0, Count0, !Acc1, !Acc2) :-
 :- mode string.foldl_substring_2(pred(in, in, out) is multi, in, in, in,
     in, out) is multi.
 
-string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
-    ( 0 < Count ->
-        Closure(string.unsafe_index(String, I), !Acc),
-        string.foldl_substring_2(Closure, String, I + 1, Count - 1, !Acc)
+string.foldl_substring_2(Closure, String, I, End, !Acc) :-
+    (
+        I < End,
+        string.unsafe_index_next(String, I, J, Char),
+        J =< End
+    ->
+        Closure(Char, !Acc),
+        string.foldl_substring_2(Closure, String, J, End, !Acc)
     ;
         true
     ).
@@ -1050,11 +1208,14 @@ string.foldl_substring_2(Closure, String, I, Count, !Acc) :-
 :- mode string.foldl2_substring_2(pred(in, in, out, in, out) is multi,
     in, in, in, in, out, in, out) is multi.
 
-string.foldl2_substring_2(Closure, String, I, Count, !Acc1, !Acc2) :-
-    ( 0 < Count ->
-        Closure(string.unsafe_index(String, I), !Acc1, !Acc2),
-        string.foldl2_substring_2(Closure, String, I + 1, Count - 1,
-            !Acc1, !Acc2)
+string.foldl2_substring_2(Closure, String, I, End, !Acc1, !Acc2) :-
+    (
+        I < End,
+        string.unsafe_index_next(String, I, J, Char),
+        J =< End
+    ->
+        Closure(Char, !Acc1, !Acc2),
+        string.foldl2_substring_2(Closure, String, J, End, !Acc1, !Acc2)
     ;
         true
     ).
@@ -1073,7 +1234,8 @@ string.foldr(Closure, String, Acc0, Acc) :-
 string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
     Start = max(0, Start0),
     Count = min(Count0, length(String) - Start),
-    string.foldr_substring_2(Closure, String, Start, Count, Acc0, Acc).
+    End = Start + Count,
+    string.foldr_substring_2(Closure, String, Start, End, Acc0, Acc).
 
 :- pred string.foldr_substring_2(pred(char, T, T), string, int, int, T, T).
 :- mode string.foldr_substring_2(pred(in, in, out) is det, in, in, in,
@@ -1087,10 +1249,14 @@ string.foldr_substring(Closure, String, Start0, Count0, Acc0, Acc) :-
 :- mode string.foldr_substring_2(pred(in, in, out) is multi, in, in, in,
     in, out) is multi.
 
-string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
-    ( 0 < Count ->
-        Closure(string.unsafe_index(String, I + Count - 1), !Acc),
-        string.foldr_substring_2(Closure, String, I, Count - 1, !Acc)
+string.foldr_substring_2(Closure, String, Start, I, !Acc) :-
+    (
+        I > Start,
+        string.unsafe_prev_index(String, I, J, Char),
+        J >= Start
+    ->
+        Closure(Char, !Acc),
+        string.foldr_substring_2(Closure, String, Start, J, !Acc)
     ;
         true
     ).
@@ -1098,11 +1264,25 @@ string.foldr_substring_2(Closure, String, I, Count, !Acc) :-
 string.left(String, Count, LeftString) :-
     string.split(String, Count, LeftString, _RightString).
 
+string.left_by_codepoint(String, Count) = LeftString :-
+    string.left_by_codepoint(String, Count, LeftString).
+
+string.left_by_codepoint(String, Count, LeftString) :-
+    string.split_by_codepoint(String, Count, LeftString, _RightString).
+
 string.right(String, RightCount, RightString) :-
     string.length(String, Length),
     LeftCount = Length - RightCount,
     string.split(String, LeftCount, _LeftString, RightString).
 
+string.right_by_codepoint(String, RightCount) = RightString :-
+    string.right_by_codepoint(String, RightCount, RightString).
+
+string.right_by_codepoint(String, RightCount, RightString) :-
+    string.count_codepoints(String, TotalCount),
+    LeftCount = TotalCount - RightCount,
+    string.split_by_codepoint(String, LeftCount, _LeftString, RightString).
+
 string.remove_suffix(String, Suffix, StringWithoutSuffix) :-
     string.suffix(String, Suffix),
     string.left(String, length(String) - length(Suffix), StringWithoutSuffix).
@@ -1141,27 +1321,28 @@ string.prefix(String::in, Prefix::in) :-
     PreLen =< Len,
     prefix_2_iii(String, Prefix, PreLen - 1).
 string.prefix(String::in, Prefix::out) :-
-    Len = length(String),
-    prefix_2_ioii(String, Prefix, 0, Len).
+    prefix_2_ioi(String, Prefix, 0).
 
 :- pred prefix_2_iii(string::in, string::in, int::in) is semidet.
 
 prefix_2_iii(String, Prefix, I) :-
     ( 0 =< I ->
-        (String `unsafe_index` I) = (Prefix `unsafe_index` I) `with_type` char,
+        string.unsafe_index_code_unit(String, I, C),
+        string.unsafe_index_code_unit(Prefix, I, C),
         prefix_2_iii(String, Prefix, I - 1)
     ;
         true
     ).
 
-:- pred prefix_2_ioii(string::in, string::out, int::in, int::in) is multi.
-
-prefix_2_ioii(String, Prefix, PreLen, _Len) :-
-    Prefix = unsafe_substring(String, 0, PreLen).
+:- pred prefix_2_ioi(string::in, string::out, int::in) is multi.
 
-prefix_2_ioii(String, Prefix, PreLen, Len) :-
-    PreLen < Len,
-    prefix_2_ioii(String, Prefix, PreLen + 1, Len).
+prefix_2_ioi(String, Prefix, Cur) :-
+    (
+        Prefix = unsafe_substring(String, 0, Cur)
+    ;
+        string.unsafe_index_next(String, Cur, Next, _),
+        prefix_2_ioi(String, Prefix, Next)
+    ).
 
 :- pragma promise_equivalent_clauses(string.suffix/2).
 
@@ -1172,15 +1353,15 @@ string.suffix(String::in, Suffix::in) :-
     suffix_2_iiii(String, Suffix, 0, Len - PreLen, PreLen).
 string.suffix(String::in, Suffix::out) :-
     Len = length(String),
-    suffix_2_ioii(String, Suffix, 0, Len).
+    suffix_2_ioii(String, Suffix, Len, Len).
 
 :- pred suffix_2_iiii(string::in, string::in, int::in, int::in, int::in)
     is semidet.
 
 suffix_2_iiii(String, Suffix, I, Offset, Len) :-
     ( I < Len ->
-        (String `unsafe_index` (I + Offset)) =
-            (Suffix `unsafe_index` I) `with_type` char,
+        string.unsafe_index_code_unit(String, I + Offset, C),
+        string.unsafe_index_code_unit(Suffix, I, C),
         suffix_2_iiii(String, Suffix, I + 1, Offset, Len)
     ;
         true
@@ -1188,12 +1369,13 @@ suffix_2_iiii(String, Suffix, I, Offset, Len) :-
 
 :- pred suffix_2_ioii(string::in, string::out, int::in, int::in) is multi.
 
-suffix_2_ioii(String, Suffix, SufLen, Len) :-
-    Suffix = unsafe_substring(String, Len - SufLen, SufLen).
-
-suffix_2_ioii(String, Suffix, SufLen, Len) :-
-    SufLen < Len,
-    suffix_2_ioii(String, Suffix, SufLen + 1, Len).
+suffix_2_ioii(String, Suffix, Cur, Len) :-
+    (
+        string.unsafe_substring(String, Cur, Len, Suffix)
+    ;
+        string.unsafe_prev_index(String, Cur, Prev, _),
+        suffix_2_ioii(String, Suffix, Prev, Len)
+    ).
 
 string.char_to_string(Char, String) :-
     string.to_char_list(String, [Char]).
@@ -1364,11 +1546,16 @@ string.to_char_list(Str::uo, CharList::in) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    MR_ConstString p = Str + strlen(Str);
+    int pos = strlen(Str);
+    int c;
+
     CharList = MR_list_empty_msg(MR_PROC_LABEL);
-    while (p > Str) {
-        p--;
-        CharList = MR_char_list_cons_msg((MR_UnsignedChar) *p, CharList,
+    for (;;) {
+        c = MR_utf8_prev_get(Str, &pos);
+        if (c <= 0) {
+            break;
+        }
+        CharList = MR_char_list_cons_msg((MR_UnsignedChar) c, CharList,
             MR_PROC_LABEL);
     }
 }").
@@ -1380,7 +1567,23 @@ string.to_char_list(Str::uo, CharList::in) :-
 "
     list.List_1 lst = list.empty_list();
     for (int i = Str.Length - 1; i >= 0; i--) {
-        char c = Str[i];
+        int c;
+        char c2 = Str[i];
+        if (System.Char.IsLowSurrogate(c2)) {
+            try {
+                char c1 = Str[i - 1];
+                c = System.Char.ConvertToUtf32(c1, c2);
+                i--;
+            } catch (System.ArgumentOutOfRangeException) {
+                c = 0xfffd;
+            } catch (System.IndexOutOfRangeException) {
+                c = 0xfffd;
+            }
+        } else if (System.Char.IsHighSurrogate(c2)) {
+            c = 0xfffd;
+        } else {
+            c = c2;
+        }
         lst = list.cons(c, lst);
     }
     CharList = lst;
@@ -1391,10 +1594,11 @@ string.to_char_list(Str::uo, CharList::in) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    list.List_1<Character> lst = list.empty_list();
-    for (int i = Str.length() - 1; i >= 0; i--) {
-        char c = Str.charAt(i);
+    list.List_1<Integer> lst = list.empty_list();
+    for (int i = Str.length(); i > 0; ) {
+        int c = Str.codePointBefore(i);
         lst = list.cons(c, lst);
+        i -= java.lang.Character.charCount(c);
     }
     CharList = lst;
 ").
@@ -1404,7 +1608,7 @@ string.to_char_list(Str::uo, CharList::in) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
-    CharList = binary_to_list(Str)
+    CharList = unicode:characters_to_list(Str)
 ").
 
 string.to_char_list_2(Str, CharList) :-
@@ -1451,7 +1655,7 @@ string.from_char_list(Chars::in, Str::uo) :-
     size = 0;
     char_list_ptr = CharList;
     while (! MR_list_is_empty(char_list_ptr)) {
-        size++;
+        size += MR_utf8_width(MR_list_head(char_list_ptr));
         char_list_ptr = MR_list_tail(char_list_ptr);
     }
 
@@ -1467,8 +1671,8 @@ string.from_char_list(Chars::in, Str::uo) :-
     size = 0;
     char_list_ptr = CharList;
     while (! MR_list_is_empty(char_list_ptr)) {
-        MR_Char c;
-        c = (MR_Char) MR_list_head(char_list_ptr);
+        int c;
+        c = MR_list_head(char_list_ptr);
         /*
         ** It is an error to put a null character in a string
         ** (see the comments at the top of this file).
@@ -1477,7 +1681,7 @@ string.from_char_list(Chars::in, Str::uo) :-
             SUCCESS_INDICATOR = MR_FALSE;
             break;
         }
-        Str[size++] = c;
+        size += MR_utf8_encode(Str + size, c);
         char_list_ptr = MR_list_tail(char_list_ptr);
     }
 
@@ -1491,7 +1695,12 @@ string.from_char_list(Chars::in, Str::uo) :-
 "
     System.Text.StringBuilder sb = new System.Text.StringBuilder();
     while (!list.is_empty(CharList)) {
-        sb.Append((char) list.det_head(CharList));
+        int cp = (int) list.det_head(CharList);
+        if (cp <= 0xffff) {
+            sb.Append((char) cp);
+        } else {
+            sb.Append(System.Char.ConvertFromUtf32(cp));
+        }
         CharList = list.det_tail(CharList);
     }
     Str = sb.ToString();
@@ -1504,9 +1713,14 @@ string.from_char_list(Chars::in, Str::uo) :-
         does_not_affect_liveness],
 "
     java.lang.StringBuilder sb = new StringBuilder();
-    Iterable<Character> iterable = new list.ListIterator<Character>(CharList);
-    for (char c : iterable) {
-        sb.append(c);
+    Iterable<Integer> iterable = new list.ListIterator<Integer>(CharList);
+    for (int c : iterable) {
+        if (c <= 0xffff) {
+            /* Fast path. */
+            sb.append((char) c);
+        } else {
+            sb.append(java.lang.Character.toChars(c));
+        }
     }
     Str = sb.toString();
     SUCCESS_INDICATOR = true;
@@ -1517,8 +1731,8 @@ string.from_char_list(Chars::in, Str::uo) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
-    SUCCESS_INDICATOR = true,
-    Str = list_to_binary(CharList)
+    Str = unicode:characters_to_binary(CharList),
+    SUCCESS_INDICATOR = true
 ").
 
 :- pragma promise_equivalent_clauses(string.semidet_from_char_list/2).
@@ -1561,7 +1775,7 @@ string.from_rev_char_list(Chars, Str) :-
     size = 0;
     list_ptr = Chars;
     while (!MR_list_is_empty(list_ptr)) {
-        size++;
+        size += MR_utf8_width(MR_list_head(list_ptr));
         list_ptr = MR_list_tail(list_ptr);
     }
 
@@ -1589,7 +1803,8 @@ string.from_rev_char_list(Chars, Str) :-
             SUCCESS_INDICATOR = MR_FALSE;
             break;
         }
-        Str[--size] = c;
+        size -= MR_utf8_width(c);
+        MR_utf8_encode(Str + size, c);
         list_ptr = MR_list_tail(list_ptr);
     }
 }").
@@ -1602,14 +1817,26 @@ string.from_rev_char_list(Chars, Str) :-
     int size = 0;
     list.List_1 list_ptr = Chars;
     while (!list.is_empty(list_ptr)) {
-        size++;
+        int c = (int) list.det_head(list_ptr);
+        if (c <= 0xffff) {
+            size++;
+        } else {
+            size += 2;
+        }
         list_ptr = list.det_tail(list_ptr);
     }
 
     char[] arr = new char[size];
     list_ptr = Chars;
     while (!list.is_empty(list_ptr)) {
-        arr[--size] = (char) list.det_head(list_ptr);
+        int c = (int) list.det_head(list_ptr);
+        if (c <= 0xffff) {
+            arr[--size] = (char) c;
+        } else {
+            string s = System.Char.ConvertFromUtf32(c);
+            arr[--size] = s[1];
+            arr[--size] = s[0];
+        }
         list_ptr = list.det_tail(list_ptr);
     }
 
@@ -1617,11 +1844,158 @@ string.from_rev_char_list(Chars, Str) :-
     SUCCESS_INDICATOR = true;
 ").
 
+% XXX semidet_from_rev_char_list for Java?
+
 string.semidet_from_rev_char_list(Chars::in, Str::uo) :-
     string.semidet_from_char_list(list.reverse(Chars), Str).
 
 %---------------------------------------------------------------------------%
 
+string.to_code_unit_list(String, List) :-
+    string.to_code_unit_list_2(String, 0, string.length(String), List).
+
+:- pred string.to_code_unit_list_2(string::in, int::in, int::in,
+    list(int)::out) is det.
+
+string.to_code_unit_list_2(String, Index, End, List) :-
+    ( Index >= End ->
+        List = []
+    ;
+        string.unsafe_index_code_unit(String, Index, Code),
+        string.to_code_unit_list_2(String, Index + 1, End, Tail),
+        List = [Code | Tail]
+    ).
+
+%-----------------------------------------------------------------------------%
+
+:- pragma foreign_proc("C",
+    string.from_code_unit_list(CodeList::in, Str::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, may_not_duplicate, no_sharing],
+"
+    MR_Word list_ptr;
+    size_t  size;
+
+    size = 0;
+    list_ptr = CodeList;
+    while (! MR_list_is_empty(list_ptr)) {
+        size++;
+        list_ptr = MR_list_tail(list_ptr);
+    }
+
+    MR_allocate_aligned_string_msg(Str, size, MR_PROC_LABEL);
+
+    SUCCESS_INDICATOR = MR_TRUE;
+    size = 0;
+    list_ptr = CodeList;
+    while (! MR_list_is_empty(list_ptr)) {
+        int c;
+        c = MR_list_head(list_ptr);
+        /*
+        ** It is an error to put a null character in a string
+        ** (see the comments at the top of this file).
+        */
+        if (c == '\\0' || c > 0xff) {
+            SUCCESS_INDICATOR = MR_FALSE;
+            break;
+        }
+        Str[size] = c;
+        size++;
+        list_ptr = MR_list_tail(list_ptr);
+    }
+
+    Str[size] = '\\0';
+
+    SUCCESS_INDICATOR = SUCCESS_INDICATOR && MR_utf8_verify(Str);
+").
+
+:- pragma foreign_proc("Java",
+    string.from_code_unit_list(CodeList::in, Str::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    java.lang.StringBuilder sb = new java.lang.StringBuilder();
+    boolean prev_high = false;
+
+    SUCCESS_INDICATOR = true;
+
+    Iterable<Integer> iterable = new list.ListIterator<Integer>(CodeList);
+    for (int i : iterable) {
+        char c = (char) i;
+        if (prev_high) {
+            if (!java.lang.Character.isLowSurrogate(c)) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            prev_high = false;
+        } else if (java.lang.Character.isHighSurrogate(c)) {
+            prev_high = true;
+        } else if (java.lang.Character.isLowSurrogate(c)) {
+            SUCCESS_INDICATOR = false;
+            break;
+        }
+        sb.append(c);
+    }
+
+    SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+
+    if (SUCCESS_INDICATOR) {
+        Str = sb.toString();
+    } else {
+        Str = """";
+    }
+").
+
+:- pragma foreign_proc("C#",
+    string.from_code_unit_list(CodeList::in, Str::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    System.Text.StringBuilder sb = new System.Text.StringBuilder();
+    bool prev_high = false;
+
+    SUCCESS_INDICATOR = true;
+
+    while (!list.is_empty(CodeList)) {
+        /* Both casts are required. */
+        char c = (char) (int) list.det_head(CodeList);
+        if (prev_high) {
+            if (!System.Char.IsLowSurrogate(c)) {
+                SUCCESS_INDICATOR = false;
+                break;
+            }
+            prev_high = false;
+        } else if (System.Char.IsHighSurrogate(c)) {
+            prev_high = true;
+        } else if (System.Char.IsLowSurrogate(c)) {
+            SUCCESS_INDICATOR = false;
+            break;
+        }
+        sb.Append(c);
+        CodeList = list.det_tail(CodeList);
+    }
+
+    SUCCESS_INDICATOR = SUCCESS_INDICATOR && !prev_high;
+
+    if (SUCCESS_INDICATOR) {
+        Str = sb.ToString();
+    } else {
+        Str = """";
+    }
+").
+
+:- pragma foreign_proc("Erlang",
+    string.from_code_unit_list(CodeList::in, Str::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    % XXX should validate the string 
+    Str = list_to_binary(CodeList),
+    SUCCESS_INDICATOR = true
+").
+
+%---------------------------------------------------------------------------%
+
 string.to_upper(StrIn, StrOut) :-
     string.to_char_list(StrIn, List),
     string.char_list_to_upper(List, ListUpp),
@@ -1663,15 +2037,15 @@ string.uncapitalize_first(S0, S) :-
     ).
 
 string.all_match(P, String) :-
-    all_match_2(string.length(String) - 1, P, String).
+    all_match_2(P, String, 0).
 
-:- pred all_match_2(int::in, pred(char)::in(pred(in) is semidet), string::in)
+:- pred all_match_2(pred(char)::in(pred(in) is semidet), string::in, int::in)
     is semidet.
 
-string.all_match_2(I, P, String) :-
-    ( I >= 0 ->
-        P(string.unsafe_index(String, I)),
-        string.all_match_2(I - 1, P, String)
+string.all_match_2(P, String, I) :-
+    ( string.unsafe_index_next(String, I, J, Char) ->
+        P(Char),
+        string.all_match_2(P, String, J)
     ;
         true
     ).
@@ -1817,7 +2191,7 @@ string.is_all_digits(S) :-
 ").
 
 string.pad_left(String0, PadChar, Width, String) :-
-    string.length(String0, Length),
+    string.count_codepoints(String0, Length),
     ( Length < Width ->
         Count = Width - Length,
         string.duplicate_char(PadChar, Count, PadString),
@@ -1827,7 +2201,7 @@ string.pad_left(String0, PadChar, Width, String) :-
     ).
 
 string.pad_right(String0, PadChar, Width, String) :-
-    string.length(String0, Length),
+    string.count_codepoints(String0, Length),
     ( Length < Width ->
         Count = Width - Length,
         string.duplicate_char(PadChar, Count, PadString),
@@ -2039,9 +2413,9 @@ string.hash(String) = HashVal :-
 
 string.hash_loop(String, Index, Length, !HashVal) :-
     ( Index < Length ->
-        C = char.to_int(string.unsafe_index(String, Index)),
+        string.unsafe_index_code_unit(String, Index, C),
         !:HashVal = !.HashVal `xor` (!.HashVal `unchecked_left_shift` 5),
-        !:HashVal= !.HashVal `xor` C,
+        !:HashVal = !.HashVal `xor` C,
         string.hash_loop(String, Index + 1, Length, !HashVal)
     ;
         true
@@ -2057,7 +2431,7 @@ string.hash2(String) = HashVal :-
 
 string.hash2_loop(String, Index, Length, !HashVal) :-
     ( Index < Length ->
-        C = char.to_int(string.unsafe_index(String, Index)),
+        string.unsafe_index_code_unit(String, Index, C),
         !:HashVal = !.HashVal * 37,
         !:HashVal= !.HashVal + C,
         string.hash2_loop(String, Index + 1, Length, !HashVal)
@@ -2075,7 +2449,7 @@ string.hash3(String) = HashVal :-
 
 string.hash3_loop(String, Index, Length, !HashVal) :-
     ( Index < Length ->
-        C = char.to_int(string.unsafe_index(String, Index)),
+        string.unsafe_index_code_unit(String, Index, C),
         !:HashVal = !.HashVal * 49,
         !:HashVal= !.HashVal + C,
         string.hash3_loop(String, Index + 1, Length, !HashVal)
@@ -2095,12 +2469,16 @@ string.sub_string_search(WholeString, Pattern, Index) :-
         does_not_affect_liveness, no_sharing],
 "{
     char *match;
-    match = strstr(WholeString + BeginAt, Pattern);
-    if (match) {
-        Index = match - WholeString;
-        SUCCESS_INDICATOR = MR_TRUE;
-    } else {
+    if ((MR_Unsigned) BeginAt > strlen(WholeString)) {
         SUCCESS_INDICATOR = MR_FALSE;
+    } else {
+        match = strstr(WholeString + BeginAt, Pattern);
+        if (match) {
+            Index = match - WholeString;
+            SUCCESS_INDICATOR = MR_TRUE;
+        } else {
+            SUCCESS_INDICATOR = MR_FALSE;
+        }
     }
 }").
 
@@ -2311,8 +2689,7 @@ width(Width, !PolyTypes, !Chars) :-
         non_zero_digit(!Chars),
         zero_or_more_occurences(digit, !Chars),
         Final = !.Chars,
-
-        char_list_remove_suffix(Init, Final, Width)
+        list.remove_suffix(Init, Final, Width)
     ).
 
     % Do we have a precision?
@@ -2337,7 +2714,7 @@ prec(Prec, !PolyTypes, !Chars) :-
         zero_or_more_occurences(digit, !Chars),
         Final = !.Chars
     ->
-        char_list_remove_suffix(Init, Final, Prec)
+        list.remove_suffix(Init, Final, Prec)
     ;
         % When no number follows the '.' the precision defaults to 0.
         Prec = ['0']
@@ -3896,36 +4273,49 @@ count_extra_trailing_zeroes(FloatStr, I, N0) = N :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+    char    buf[5];
+    size_t  len;
+    if (Ch <= 0x7f) {
+        /* Fast path. */
+        SUCCESS_INDICATOR = (strchr(Str, Ch) != NULL) && Ch != '\\0';
+    } else {
+        len = MR_utf8_encode(buf, Ch);
+        buf[len] = '\\0';
+        SUCCESS_INDICATOR = (strstr(Str, buf) != NULL);
+    }
 ").
 
 :- pragma foreign_proc("C#",
     string.contains_char(Str::in, Ch::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    SUCCESS_INDICATOR = (Str.IndexOf(Ch) != -1);
+    if (Ch <= 0xffff) {
+        SUCCESS_INDICATOR = (Str.IndexOf((char) Ch) != -1);
+    } else {
+        string s = System.Char.ConvertFromUtf32(Ch);
+        SUCCESS_INDICATOR = Str.Contains(s);
+    }
 ").
 
 :- pragma foreign_proc("Java",
     string.contains_char(Str::in, Ch::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    SUCCESS_INDICATOR = (Str.indexOf(Ch) != -1);
+    // indexOf(int) handles supplementary characters correctly.
+    SUCCESS_INDICATOR = (Str.indexOf((int) Ch) != -1);
 ").
 
 string.contains_char(String, Char) :-
-    string.contains_char(String, Char, 0, string.length(String)).
+    string.contains_char(String, Char, 0).
 
-:- pred string.contains_char(string::in, char::in, int::in, int::in)
-    is semidet.
+:- pred string.contains_char(string::in, char::in, int::in) is semidet.
 
-string.contains_char(Str, Char, Index, Length) :-
-    ( Index < Length ->
-        string.unsafe_index(Str, Index, IndexChar),
+string.contains_char(Str, Char, I) :-
+    ( string.unsafe_index_next(Str, I, J, IndexChar) ->
         ( IndexChar = Char ->
             true
         ;
-            string.contains_char(Str, Char, Index + 1, Length)
+            string.contains_char(Str, Char, J)
         )
     ;
         fail
@@ -3946,6 +4336,16 @@ string.index(Str, Index, Char) :-
         fail
     ).
 
+:- pragma inline(string.index_next/4).
+
+string.index_next(Str, Index, NextIndex, Char) :-
+    Len = string.length(Str),
+    ( string.index_check(Index, Len) ->
+        string.unsafe_index_next(Str, Index, NextIndex, Char)
+    ;
+        fail
+    ).
+
 :- pred string.index_check(int::in, int::in) is semidet.
 
 % We should consider making this routine a compiler built-in.
@@ -3975,44 +4375,267 @@ string.index_check(Index, Length) :-
 
 /*-----------------------------------------------------------------------*/
 
+string.unsafe_index(Str, Index, Char) :-
+    ( string.unsafe_index_2(Str, Index, Char0) ->
+        Char = Char0
+    ;
+        error("string.unsafe_index: illegal sequence")
+    ).
+
+:- pred string.unsafe_index_2(string::in, int::in, char::uo) is semidet.
+
 :- pragma foreign_proc("C",
-    string.unsafe_index(Str::in, Index::in, Ch::uo),
+    string.unsafe_index_2(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    Ch = Str[Index];
+    Ch = MR_utf8_get(Str, Index);
+    SUCCESS_INDICATOR = (Ch > 0);
 ").
 :- pragma foreign_proc("C#",
-    string.unsafe_index(Str::in, Index::in, Ch::uo),
+    string.unsafe_index_2(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Ch = Str[Index];
+    char c1 = Str[Index];
+    if (System.Char.IsSurrogate(c1)) {
+        try {
+            char c2 = Str[Index + 1];
+            Ch = System.Char.ConvertToUtf32(c1, c2);
+        } catch (System.ArgumentOutOfRangeException) {
+            Ch = -1;
+        } catch (System.IndexOutOfRangeException) {
+            Ch = -1;
+        }
+    } else {
+        /* Common case. */
+        Ch = c1;
+    }
+    SUCCESS_INDICATOR = (Ch > 0);
 ").
 :- pragma foreign_proc("Java",
-    string.unsafe_index(Str::in, Index::in, Ch::uo),
+    string.unsafe_index_2(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Ch = Str.charAt(Index);
+    Ch = Str.codePointAt(Index);
+    /* XXX Unicode Java: more error checking */
+    /* XXX Unicode Java: could be more efficient */
+    SUCCESS_INDICATOR =
+        !java.lang.Character.isHighSurrogate((char) Ch) &&
+        !java.lang.Character.isLowSurrogate((char) Ch);
 ").
 :- pragma foreign_proc("Erlang",
-    string.unsafe_index(Str::in, Index::in, Ch::uo),
+    string.unsafe_index_2(Str::in, Index::in, Ch::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    <<_:Index/binary, Ch/integer, _/binary>> = Str
+    <<_:Index/binary, Ch/utf8, _/binary>> = Str,
+    SUCCESS_INDICATOR = true
 ").
-string.unsafe_index(Str, Index, Char) :-
-    ( string.first_char(Str, First, Rest) ->
-        ( Index = 0 ->
-            Char = First
-        ;
-            string.unsafe_index(Rest, Index - 1, Char)
-        )
-    ;
-        error("string.unsafe_index: out of bounds")
-    ).
 
 String ^ unsafe_elem(Index) = unsafe_index(String, Index).
 
+:- pragma foreign_proc("C",
+    string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    int pos = Index;
+    Ch = MR_utf8_get_next(Str, &pos);
+    NextIndex = pos;
+    SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+    string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    if (Index < Str.Length) {
+        Ch = System.Char.ConvertToUtf32(Str, Index);
+        if (Ch <= 0xffff) {
+            NextIndex = Index + 1;
+        } else {
+            NextIndex = Index + 2;
+        }
+        SUCCESS_INDICATOR = true;
+    } else {
+        Ch = -1;
+        NextIndex = Index;
+        SUCCESS_INDICATOR = false;
+    }
+").
+
+:- pragma foreign_proc("Java",
+    string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    if (Index < Str.length()) {
+        Ch = Str.codePointAt(Index);
+        /* XXX Unicode Java: more error checking */
+        /* XXX Unicode Java: could be faster */
+        SUCCESS_INDICATOR =
+            !java.lang.Character.isHighSurrogate((char) Ch) &&
+            !java.lang.Character.isLowSurrogate((char) Ch);
+        if (SUCCESS_INDICATOR) {
+            NextIndex = Index + java.lang.Character.charCount(Ch);
+            SUCCESS_INDICATOR = true;
+        } else {
+            Ch = -1;
+            NextIndex = Index;
+            SUCCESS_INDICATOR = false;
+        }
+    } else {
+        Ch = -1;
+        NextIndex = Index;
+        SUCCESS_INDICATOR = false;
+    }
+").
+
+:- pragma foreign_proc("Erlang",
+    string.unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    case Str of
+        << _:Index/binary, Ch/utf8, _/binary >> ->
+            if
+                Ch =< 16#7f ->
+                    NextIndex = Index + 1;
+                Ch =< 16#7ff ->
+                    NextIndex = Index + 2;
+                Ch =< 16#ffff ->
+                    NextIndex = Index + 3;
+                true ->
+                    NextIndex = Index + 4
+            end,
+            SUCCESS_INDICATOR = true;
+        _ ->
+            Ch = -1,
+            NextIndex = Index,
+            SUCCESS_INDICATOR = false
+    end
+").
+
+:- pragma foreign_proc("C",
+    string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    int pos = Index;
+    Ch = MR_utf8_prev_get(Str, &pos);
+    PrevIndex = pos;
+    SUCCESS_INDICATOR = (Ch > 0);
+").
+
+:- pragma foreign_proc("C#",
+    string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    if (Index > 0) {
+        char c2 = Str[Index - 1];
+        if (System.Char.IsSurrogate(c2)) {
+            try {
+                char c1 = Str[Index - 2];
+                Ch = System.Char.ConvertToUtf32(c1, c2);
+                PrevIndex = Index - 2;
+            } catch (System.IndexOutOfRangeException) {
+                Ch = -1;
+                PrevIndex = Index;
+                SUCCESS_INDICATOR = false;
+            } catch (System.ArgumentOutOfRangeException) {
+                Ch = -1;
+                PrevIndex = Index;
+                SUCCESS_INDICATOR = false;
+            }
+        } else {
+            /* Common case. */
+            Ch = (int) c2;
+            PrevIndex = Index - 1;
+        }
+        SUCCESS_INDICATOR = true;
+    } else {
+        Ch = -1;
+        PrevIndex = Index;
+        SUCCESS_INDICATOR = false;
+    }
+").
+
+:- pragma foreign_proc("Java",
+    string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    if (Index > 0) {
+        Ch = Str.codePointBefore(Index);
+        PrevIndex = Index - java.lang.Character.charCount(Ch);
+        SUCCESS_INDICATOR = true;
+    } else {
+        Ch = -1;
+        PrevIndex = Index;
+        SUCCESS_INDICATOR = false;
+    }
+").
+
+:- pragma foreign_proc("Erlang",
+    string.unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
+    SUCCESS_INDICATOR = (Ch =/= -1)
+").
+
+:- pragma foreign_code("Erlang", "
+do_unsafe_prev_index(Str, Index) ->
+    if Index >= 0 ->
+        case Str of
+            <<_:Index/binary, Ch/integer, _/binary>> ->
+                if
+                    (Ch band 16#80) =:= 0 ->
+                        {Index, Ch};
+                    (Ch band 16#C0) == 16#80 ->
+                        do_unsafe_prev_index(Str, Index - 1);
+                    true ->
+                        <<_:Index/binary, Ch2/utf8, _/binary>> = Str,
+                        {Index, Ch2}
+                end;
+            true ->
+                {Index, -1}
+        end
+    ; true ->
+        {Index, -1}
+    end.
+").
+
+/*-----------------------------------------------------------------------*/
+
+:- pragma foreign_proc("C",
+    string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    const unsigned char *s = (const unsigned char *)Str;
+    Code = s[Index];
+").
+:- pragma foreign_proc("C#",
+    string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Code = Str[Index];
+").
+:- pragma foreign_proc("Java",
+    string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Code = Str.charAt(Index);
+").
+:- pragma foreign_proc("Erlang",
+    string.unsafe_index_code_unit(Str::in, Index::in, Code::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    <<_:Index/binary, Code/integer, _/binary>> = Str
+").
+
 /*-----------------------------------------------------------------------*/
 
 :- pragma foreign_decl("C",
@@ -4022,9 +4645,9 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
     ** GNU C version egcs-1.1.2 crashes with `fixed or forbidden register
     ** spilled' in grade asm_fast.gc.tr.debug if we write this inline.
     */
-    extern void MR_set_char(MR_String str, MR_Integer ind, MR_Char ch);
+    extern void MR_set_code_unit(MR_String str, MR_Integer ind, char ch);
 #else
-    #define MR_set_char(str, ind, ch) \\
+    #define MR_set_code_unit(str, ind, ch) \\
         ((str)[ind] = (ch))
 #endif
 ").
@@ -4036,7 +4659,7 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
     ** GNU C version egcs-1.1.2 crashes with `fixed or forbidden register
     ** spilled' in grade asm_fast.gc.tr.debug if we write this inline.
     */
-    void MR_set_char(MR_String str, MR_Integer ind, MR_Char ch)
+    void MR_set_code_unit(MR_String str, MR_Integer ind, char ch)
     {
         str[ind] = ch;
     }
@@ -4064,11 +4687,29 @@ string.set_char(Char, Index, !Str) :-
     size_t len = strlen(Str0);
     if ((MR_Unsigned) Index >= len) {
         SUCCESS_INDICATOR = MR_FALSE;
-    } else {
+    } else if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+        /* Fast path. */
         SUCCESS_INDICATOR = MR_TRUE;
         MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
         strcpy(Str, Str0);
-        MR_set_char(Str, Index, Ch);
+        MR_set_code_unit(Str, Index, Ch);
+    } else {
+        int oldc = MR_utf8_get(Str0, Index);
+        if (oldc < 0) {
+            SUCCESS_INDICATOR = MR_FALSE;
+        } else {
+            size_t oldwidth = MR_utf8_width(oldc);
+            size_t newwidth = MR_utf8_width(Ch);
+            size_t newlen;
+            size_t tailofs;
+
+            newlen = len - oldwidth + newwidth;
+            MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+            MR_memcpy(Str, Str0, Index);
+            MR_utf8_encode(Str + Index, Ch);
+            strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+            SUCCESS_INDICATOR = MR_TRUE;
+        }
     }
 ").
 :- pragma foreign_proc("C#",
@@ -4080,7 +4721,18 @@ string.set_char(Char, Index, !Str) :-
         SUCCESS_INDICATOR = false;
     } else {
         System.Text.StringBuilder sb = new System.Text.StringBuilder(Str0);
-        sb[Index] = Ch;
+        if (!System.Char.IsHighSurrogate(Str0, Index) && Ch <= 0xffff) {
+            /* Fast path. */
+            sb[Index] = (char) Ch;
+        } else {
+            // XXX Unicode C#: error checking
+            if (System.Char.IsLowSurrogate(Str0, Index + 1)) {
+                sb.Remove(Index, 2);
+            } else {
+                sb.Remove(Index, 1);
+            }
+            sb.Insert(Index, System.Char.ConvertFromUtf32(Ch));
+        }
         Str = sb.ToString();
         SUCCESS_INDICATOR = true;
     }
@@ -4094,7 +4746,17 @@ string.set_char(Char, Index, !Str) :-
         SUCCESS_INDICATOR = false;
     } else {
         java.lang.StringBuilder sb = new StringBuilder(Str0);
-        sb.setCharAt(Index, Ch);
+
+        int oldc = sb.codePointAt(Index);
+        int oldwidth = java.lang.Character.charCount(oldc);
+        int newwidth = java.lang.Character.charCount(Ch);
+        if (oldwidth == 1 && newwidth == 1) {
+            sb.setCharAt(Index, (char) Ch);
+        } else {
+            char[] buf = java.lang.Character.toChars(Ch);
+            sb.replace(Index, Index + oldwidth, new String(buf));
+        }
+
         Str = sb.toString();
         SUCCESS_INDICATOR = true;
     }
@@ -4104,6 +4766,7 @@ string.set_char_2(Ch, Index, Str0, Str) :-
     list.replace_nth(List0, Index + 1, Ch, List),
     string.to_char_list(Str, List).
 
+% XXX Unicode: what is this?
 % :- pragma foreign_proc("C",
 %   string.set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
 %   [will_not_call_mercury, promise_pure, thread_safe, does_not_affect_liveness],
@@ -4124,6 +4787,7 @@ string.set_char_2(Ch, Index, Str0, Str) :-
 %   if (Index >= Str0.Length) {
 %       SUCCESS_INDICATOR = false;
 %   } else {
+%       // UTF-16
 %       Str = System.String.Concat(Str0.Substring(0, Index),
 %           System.Convert.ToString(Ch),
 %           Str0.Substring(Index + 1));
@@ -4152,32 +4816,65 @@ string.unsafe_set_char(Char, Index, !Str) :-
         does_not_affect_liveness],
 "
     size_t len = strlen(Str0);
-    MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
-    strcpy(Str, Str0);
-    MR_set_char(Str, Index, Ch);
+    if (Str0[Index] <= 0x7f && Ch <= 0x7f) {
+        /* Fast path. */
+        SUCCESS_INDICATOR = MR_TRUE;
+        MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
+        strcpy(Str, Str0);
+        MR_set_code_unit(Str, Index, Ch);
+    } else {
+        int oldc = MR_utf8_get(Str0, Index);
+        if (oldc < 0) {
+            SUCCESS_INDICATOR = MR_FALSE;
+        } else {
+            size_t oldwidth = MR_utf8_width(oldc);
+            size_t newwidth = MR_utf8_width(Ch);
+            size_t newlen;
+            size_t tailofs;
+
+            newlen = len - oldwidth + newwidth;
+            MR_allocate_aligned_string_msg(Str, newlen, MR_PROC_LABEL);
+            MR_memcpy(Str, Str0, Index);
+            MR_utf8_encode(Str + Index, Ch);
+            strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
+            SUCCESS_INDICATOR = MR_TRUE;
+        }
+    }
 ").
 :- pragma foreign_proc("C#",
     string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Str = System.String.Concat(Str0.Substring(0, Index),
-        System.Convert.ToString(Ch),
-        Str0.Substring(Index + 1));
+    // XXX Unicode C#: need more error checking
+    if (System.Char.IsHighSurrogate(Str0, Index)) {
+        Str = System.String.Concat(Str0.Substring(0, Index),
+            System.Char.ConvertFromUtf32(Ch),
+            Str0.Substring(Index + 2));
+    } else {
+        Str = System.String.Concat(Str0.Substring(0, Index),
+            System.Char.ConvertFromUtf32(Ch),
+            Str0.Substring(Index + 1));
+    }
 ").
 :- pragma foreign_proc("Java",
     string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Str = Str0.substring(0, Index) + Ch + Str0.substring(Index + 1);
+    int oldc = Str0.codePointAt(Index);
+    int oldwidth = java.lang.Character.charCount(oldc);
+    Str = Str0.subSequence(0, Index)
+        + new String(Character.toChars(Ch))
+        + Str0.subSequence(Index + oldwidth, Str0.length());
 ").
 :- pragma foreign_proc("Erlang",
     string.unsafe_set_char_2(Ch::in, Index::in, Str0::in, Str::out),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    <<Left:Index/binary, _/integer, Right/binary>> = Str0,
-    Str = list_to_binary([Left, Ch, Right])
+    <<Left:Index/binary, _/utf8, Right/binary>> = Str0,
+    Str = unicode:characters_to_binary([Left, Ch, Right])
 ").
 
+% NOTE: these procs do not understand variable width encodings
 % :- pragma foreign_proc("C",
 %   string.unsafe_set_char_2(Ch::in, Index::in, Str0::di, Str::uo),
 %   [will_not_call_mercury, promise_pure, thread_safe, does_not_affect_liveness],
@@ -4256,20 +4953,140 @@ string.unsafe_set_char(Char, Index, !Str) :-
     Length = size(Str)
 ").
 
-string.length(Str0, Len) :-
-    % XXX This copy is only necessary because of the ui.
-    copy(Str0, Str),
-    string.length_2(Str, 0, Len).
+string.length(Str, Len) :-
+    string.to_code_unit_list(Str, CodeList),
+    list.length(CodeList, Len).
+
+string.count_code_units(Str) = string.length(Str).
+
+string.count_code_units(Str, Length) :-
+    string.length(Str, Length).
+
+/*-----------------------------------------------------------------------*/
+
+string.count_codepoints(String) = Count :-
+    string.count_codepoints(String, Count).
+
+string.count_codepoints(String, Count) :-
+    count_codepoints_2(String, 0, 0, Count).
+
+:- pred count_codepoints_2(string::in, int::in, int::in, int::out) is det.
+
+count_codepoints_2(String, I, Count0, Count) :-
+    ( string.unsafe_index_next(String, I, J, _) ->
+        count_codepoints_2(String, J, Count0 + 1, Count)
+    ;   
+        Count = Count0
+    ).
+
+:- pragma foreign_proc("C",
+    string.count_codepoints(String::in, Count::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    unsigned char   b;
+    int             i;
+
+    Count = 0;
+    for (i = 0; ; i++) {
+        b = String[i];
+        if (b == '\\0') {
+            break;
+        }
+        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+            Count++;
+        }
+    }
+").
+
+:- pragma foreign_proc("C#",
+    string.count_codepoints(String::in, Count::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    Count = 0;
+    foreach (char c in String) {
+        if (!System.Char.IsLowSurrogate(c)) {
+            Count++;
+        }
+    }
+").
+
+:- pragma foreign_proc("Java",
+    string.count_codepoints(String::in, Count::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    Count = String.codePointCount(0, String.length());
+").
 
-:- pred string.length_2(string::in, int::in, int::out) is det.
+    % Note: we do not define what happens with unpaired surrogates.
+    %
+string.codepoint_offset(String, N, Index) :-
+    string.codepoint_offset(String, 0, N, Index).
+
+% XXX unicode: untested
+string.codepoint_offset(String, StartOffset, N, Index) :-
+    StartOffset >= 0,
+    StartOffset < string.length(String),
+    string.codepoint_offset_2(String, StartOffset, N, Index).
 
-string.length_2(Str, Index, Length) :-
-    ( string.index(Str, Index, _) ->
-        string.length_2(Str, Index + 1, Length)
+% XXX unicode: untested
+:- pred codepoint_offset_2(string::in, int::in, int::in, int::out) is semidet.
+
+codepoint_offset_2(String, Offset, N, Index) :-
+    ( N = 0 ->
+        Index = Offset
     ;
-        Length = Index
+        string.unsafe_index_next(String, Offset, NextOffset, _),
+        string.codepoint_offset_2(String, NextOffset, N - 1, Index)
     ).
 
+:- pragma foreign_proc("C",
+    string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    size_t          len;
+    unsigned char   b;
+
+    SUCCESS_INDICATOR = MR_FALSE;
+    len = strlen(String);
+    for (Index = StartOffset; Index < len; Index++) {
+        b = String[Index];
+        if (MR_utf8_is_single_byte(b) || MR_utf8_is_lead_byte(b)) {
+            if (N-- == 0) {
+                SUCCESS_INDICATOR = MR_TRUE;
+                break;
+            }
+        }
+    }
+").
+
+:- pragma foreign_proc("C#",
+    string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    SUCCESS_INDICATOR = false;
+    for (Index = StartOffset; Index < String.Length; Index++) {
+        if (!System.Char.IsLowSurrogate(String, Index)) {
+            if (N-- == 0) {
+                SUCCESS_INDICATOR = true;
+                break;
+            }
+        }
+    }
+").
+
+:- pragma foreign_proc("Java",
+    string.codepoint_offset(String::in, StartOffset::in, N::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe, may_not_duplicate],
+"
+    try {
+        Index = String.offsetByCodePoints(StartOffset, N);
+        SUCCESS_INDICATOR = true;
+    } catch (IndexOutOfBoundsException e) {
+        Index = -1;
+        SUCCESS_INDICATOR = false;
+    }
+").
+
 /*-----------------------------------------------------------------------*/
 
 :- pragma promise_equivalent_clauses(string.append/3).
@@ -4438,7 +5255,8 @@ string.append_ooi_2(NextS1Len, S3Len, S1, S2, S3) :-
         (
             string.append_ooi_3(NextS1Len, S3Len, S1, S2, S3)
         ;
-            string.append_ooi_2(NextS1Len + 1, S3Len, S1, S2, S3)
+            string.unsafe_index_next(S3, NextS1Len, AdvS1Len, _),
+            string.append_ooi_2(AdvS1Len, S3Len, S1, S2, S3)
         )
     ).
 
@@ -4513,12 +5331,15 @@ substring(Str, !.Start, !.Count, SubStr) :-
 :- func strchars(int, int, string) = list(char).
 
 strchars(I, End, Str) = Chars :-
-    ( I >= End ->
-        Chars = []
-    ;
-        C = string.unsafe_index(Str, I),
-        Cs = strchars(I + 1, End, Str),
+    (
+        I < End,
+        string.unsafe_index_next(Str, I, J, C),
+        J =< End
+    ->
+        Cs = strchars(J, End, Str),
         Chars = [C | Cs]
+    ;
+        Chars = []
     ).
 
 :- pragma foreign_proc("C",
@@ -4611,6 +5432,23 @@ strchars(I, End, Str) = Chars :-
     end
 ").
 
+string.substring_by_codepoint(Str, Start, Count) = SubString :-
+    string.substring_by_codepoint(Str, Start, Count, SubString).
+
+string.substring_by_codepoint(Str, Start, Count, SubString) :-
+    ( string.codepoint_offset(Str, Start, StartOffset0) ->
+        StartOffset = StartOffset0
+    ;
+        StartOffset = 0
+    ),
+    ( string.codepoint_offset(Str, StartOffset, Count, EndOffset0) ->
+        EndOffset = EndOffset0
+    ;
+        EndOffset = string.length(Str)
+    ),
+    OffsetCount = EndOffset - StartOffset,
+    string.substring(Str, StartOffset, OffsetCount, SubString).
+
 :- pragma foreign_proc("C",
     string.unsafe_substring(Str::in, Start::in, Count::in, SubString::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
@@ -4722,24 +5560,36 @@ strchars(I, End, Str) = Chars :-
     end
 ").
 
-string.split(Str, Count, Left, Right) :-
-    ( Count =< 0 ->
+% XXX this portable version is no longer correct as it deals in code points
+% string.split(Str, Count, Left, Right) :-
+%     ( Count =< 0 ->
+%         Left = "",
+%         copy(Str, Right)
+%     ;
+%         string.to_char_list(Str, List),
+%         Len = string.length(Str),
+%         ( Count > Len ->
+%             Num = Len
+%         ;
+%             Num = Count
+%         ),
+%         ( list.split_list(Num, List, LeftList, RightList) ->
+%             string.to_char_list(Left, LeftList),
+%             string.to_char_list(Right, RightList)
+%         ;
+%             error("string.split")
+%         )
+%     ).
+
+string.split_by_codepoint(Str, Count, Left, Right) :-
+    ( string.codepoint_offset(Str, Count, Offset) ->
+        string.split(Str, Offset, Left, Right)
+    ; Count =< 0 ->
         Left = "",
         copy(Str, Right)
     ;
-        string.to_char_list(Str, List),
-        Len = string.length(Str),
-        ( Count > Len ->
-            Num = Len
-        ;
-            Num = Count
-        ),
-        ( list.split_list(Num, List, LeftList, RightList) ->
-            string.to_char_list(Left, LeftList),
-            string.to_char_list(Right, RightList)
-        ;
-            error("string.split")
-        )
+        copy(Str, Left),
+        Right = ""
     ).
 
 /*-----------------------------------------------------------------------*/
@@ -4749,10 +5599,12 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
+    int pos = 0;
+    int c = MR_utf8_get_next(Str, &pos);
     SUCCESS_INDICATOR = (
-        Str[0] == First &&
+        c == First &&
         First != '\\0' &&
-        strcmp(Str + 1, Rest) == 0
+        strcmp(Str + pos, Rest) == 0
     );
 ").
 :- pragma foreign_proc("C#",
@@ -4760,26 +5612,39 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     int len = Str.Length;
-    SUCCESS_INDICATOR = (
-        len > 0 &&
-        Str[0] == First &&
-        System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
-    );
+    if (First <= 0xffff) {
+        SUCCESS_INDICATOR = (
+            len > 0 &&
+            Str[0] == First &&
+            System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0
+        );
+    } else {
+        string firstchars = System.Char.ConvertFromUtf32(First);
+        SUCCESS_INDICATOR = (
+            len > 1 &&
+            Str[0] == firstchars[0] &&
+            Str[1] == firstchars[1] &&
+            System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0
+        );
+    }
 ").
 :- pragma foreign_proc("Java",
     string.first_char(Str::in, First::in, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    SUCCESS_INDICATOR = (Str.length() == Rest.length() + 1 &&
-        Str.charAt(0) == First &&
-        Str.endsWith(Rest));
+    int toffset = java.lang.Character.charCount(First);
+    SUCCESS_INDICATOR = (
+        Str.length() > 0 &&
+        Str.codePointAt(0) == First &&
+        Str.regionMatches(toffset, Rest, 0, Rest.length())
+    );
 ").
 :- pragma foreign_proc("Erlang",
     string.first_char(Str::in, First::in, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     case Str of
-        <<First, Rest/binary>> ->
+        <<First/utf8, Rest/binary>> ->
             SUCCESS_INDICATOR = true;
         _ ->
             SUCCESS_INDICATOR = false
@@ -4791,19 +5656,31 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    First = Str[0];
-    SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + 1, Rest) == 0);
+    int pos = 0;
+    First = MR_utf8_get_next(Str, &pos);
+    SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + pos, Rest) == 0);
 ").
 :- pragma foreign_proc("C#",
     string.first_char(Str::in, First::uo, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    int len = Str.Length;
-    if (len > 0) {
-        SUCCESS_INDICATOR = (System.String.CompareOrdinal(Str, 1, Rest, 0, len)
-            == 0);
-        First = Str[0];
-    } else {
+    try {
+        int len = Str.Length;
+        char c1 = Str[0];
+        if (System.Char.IsHighSurrogate(c1)) {
+            char c2 = Str[1];
+            First = System.Char.ConvertToUtf32(c1, c2);
+            SUCCESS_INDICATOR =
+                (System.String.CompareOrdinal(Str, 2, Rest, 0, len) == 0);
+        } else {
+            First = c1;
+            SUCCESS_INDICATOR =
+                (System.String.CompareOrdinal(Str, 1, Rest, 0, len) == 0);
+        }
+    } catch (System.IndexOutOfRangeException) {
+        SUCCESS_INDICATOR = false;
+        First = (char) 0;
+    } catch (System.ArgumentOutOfRangeException) {
         SUCCESS_INDICATOR = false;
         First = (char) 0;
     }
@@ -4812,13 +5689,15 @@ string.split(Str, Count, Left, Right) :-
     string.first_char(Str::in, First::uo, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    if (Str.length() == Rest.length() + 1 && Str.endsWith(Rest)) {
-        SUCCESS_INDICATOR = true;
-        First = Str.charAt(0);
+    int toffset;
+    if (Str.length() > 0) {
+        First = Str.codePointAt(0);
+        toffset = java.lang.Character.charCount(First);
+        SUCCESS_INDICATOR = Str.regionMatches(toffset, Rest, 0, Rest.length());
     } else {
         SUCCESS_INDICATOR = false;
         // XXX to avoid uninitialized var warning
-        First = (char) 0;
+        First = 0;
     }
 ").
 :- pragma foreign_proc("Erlang",
@@ -4826,7 +5705,7 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     case Str of
-        << First, Rest/binary >> ->
+        <<First/utf8, Rest/binary>> ->
             SUCCESS_INDICATOR = true;
         _ ->
             SUCCESS_INDICATOR = false,
@@ -4839,10 +5718,12 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    if (Str[0] != First || First == '\\0') {
+    int pos = 0;
+    int c = MR_utf8_get_next(Str, &pos);
+    if (c != First || First == '\\0') {
         SUCCESS_INDICATOR = MR_FALSE;
     } else {
-        Str++;
+        Str += pos;
         /*
         ** We need to make a copy to ensure that the pointer is word-aligned.
         */
@@ -4858,8 +5739,15 @@ string.split(Str, Count, Left, Right) :-
     int len = Str.Length;
 
     if (len > 0) {
-        SUCCESS_INDICATOR = (First == Str[0]);
-        Rest = Str.Substring(1);
+        if (First <= 0xffff) {
+            SUCCESS_INDICATOR = (First == Str[0]);
+            Rest = Str.Substring(1);
+        } else {
+            string firststr = System.Char.ConvertFromUtf32(First);
+            SUCCESS_INDICATOR =
+                (System.String.CompareOrdinal(Str, 0, firststr, 0, 2) == 0);
+            Rest = Str.Substring(2);
+        }
     } else {
         SUCCESS_INDICATOR = false;
         Rest = null;
@@ -4872,8 +5760,8 @@ string.split(Str, Count, Left, Right) :-
     int len = Str.length();
 
     if (len > 0) {
-        SUCCESS_INDICATOR = (First == Str.charAt(0));
-        Rest = Str.substring(1);
+        SUCCESS_INDICATOR = (First == Str.codePointAt(0));
+        Rest = Str.substring(java.lang.Character.charCount(First));
     } else {
         SUCCESS_INDICATOR = false;
         // XXX to avoid uninitialized var warning
@@ -4885,7 +5773,7 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     case Str of
-        <<First, Rest/binary>> ->
+        <<First/utf8, Rest/binary>> ->
             SUCCESS_INDICATOR = true;
         _ ->
             SUCCESS_INDICATOR = false,
@@ -4898,11 +5786,12 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    First = Str[0];
+    int pos = 0;
+    First = MR_utf8_get_next(Str, &pos);
     if (First == '\\0') {
         SUCCESS_INDICATOR = MR_FALSE;
     } else {
-        Str++;
+        Str += pos;
         /*
         ** We need to make a copy to ensure that the pointer is word-aligned.
         */
@@ -4915,14 +5804,25 @@ string.split(Str, Count, Left, Right) :-
     string.first_char(Str::in, First::uo, Rest::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "{
-    if (Str.Length == 0) {
+    try {
+        char c1 = Str[0];
+        if (System.Char.IsHighSurrogate(c1)) {
+            char c2 = Str[1];
+            First = System.Char.ConvertToUtf32(c1, c2);
+            Rest = Str.Substring(2);
+        } else {
+            First = Str[0];
+            Rest = Str.Substring(1);
+        }
+        SUCCESS_INDICATOR = true;
+    } catch (System.IndexOutOfRangeException) {
+        SUCCESS_INDICATOR = false;
+        First = (char) 0;
+        Rest = null;
+    } catch (System.ArgumentOutOfRangeException) {
         SUCCESS_INDICATOR = false;
         First = (char) 0;
         Rest = null;
-    } else {
-        First = Str[0];
-        Rest = Str.Substring(1);
-        SUCCESS_INDICATOR = true;
     }
 }").
 :- pragma foreign_proc("Java",
@@ -4931,12 +5831,12 @@ string.split(Str, Count, Left, Right) :-
 "{
     if (Str.length() == 0) {
         SUCCESS_INDICATOR = false;
-        // XXX to avoid uninitialized var warnings:
         First = (char) 0;
         Rest = null;
     } else {
-        First = Str.charAt(0);
-        Rest = Str.substring(1);
+        // XXX Unicode Java: do error checking?
+        First = Str.codePointAt(0);
+        Rest = Str.substring(java.lang.Character.charCount(First));
         SUCCESS_INDICATOR = true;
     }
 }").
@@ -4945,7 +5845,7 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     case Str of
-        <<First, Rest/binary>> ->
+        <<First/utf8, Rest/binary>> ->
             SUCCESS_INDICATOR = true;
         _ ->
             SUCCESS_INDICATOR = false,
@@ -4959,31 +5859,36 @@ string.split(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    size_t len = strlen(Rest) + 1;
+    size_t firstw = MR_utf8_width(First);
+    size_t len = firstw + strlen(Rest);
     MR_allocate_aligned_string_msg(Str, len, MR_PROC_LABEL);
-    Str[0] = First;
-    strcpy(Str + 1, Rest);
+    MR_utf8_encode(Str, First);
+    strcpy(Str + firstw, Rest);
 }").
 :- pragma foreign_proc("C#",
     string.first_char(Str::uo, First::in, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "{
     string FirstStr;
-    FirstStr = new System.String(First, 1);
-    Str = System.String.Concat(FirstStr, Rest);
+    if (First <= 0xffff) {
+        FirstStr = new System.String((char) First, 1);
+    } else {
+        FirstStr = System.Char.ConvertFromUtf32(First);
+    }
+    Str = FirstStr + Rest;
 }").
 :- pragma foreign_proc("Java",
     string.first_char(Str::uo, First::in, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "{
-    java.lang.String FirstStr = java.lang.String.valueOf(First);
+    String FirstStr = new String(Character.toChars(First));
     Str = FirstStr.concat(Rest);
 }").
 :- pragma foreign_proc("Erlang",
     string.first_char(Str::uo, First::in, Rest::in),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    Str = list_to_binary([First, Rest])
+    Str = unicode:characters_to_binary([First, Rest])
 ").
 
 %-----------------------------------------------------------------------------%
@@ -5086,23 +5991,40 @@ string.format(S1, PT) = S2 :-
 %------------------------------------------------------------------------------%
 
 string.words_separator(SepP, String) = Words :-
-    I = preceding_boundary(isnt(SepP), String, string.length(String) - 1),
-    Words = words_2(SepP, String, I, []).
+    next_boundary(SepP, String, 0, WordStart),
+    words_2(SepP, String, WordStart, Words).
 
-%------------------------------------------------------------------------------%
+:- pred words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
+    list(string)::out) is det.
 
-:- func words_2(pred(char)::in(pred(in) is semidet), string::in, int::in,
-    list(string)::in) = (list(string)::out) is det.
+words_2(SepP, String, WordStart, Words) :-
+    next_boundary(isnt(SepP), String, WordStart, WordEnd),
+    ( WordEnd = WordStart ->
+        Words = []
+    ;
+        string.unsafe_substring(String, WordStart, WordEnd - WordStart, Word),
+        next_boundary(SepP, String, WordEnd, NextWordStart),
+        ( WordEnd = NextWordStart ->
+            Words = [Word]
+        ;
+            words_2(SepP, String, NextWordStart, Words0),
+            Words = [Word | Words0]
+        )
+    ).
 
-words_2(SepP, String, WordEnd, Words0) = Words :-
-    ( WordEnd < 0 ->
-        Words = Words0
+    % Return the smallest I >= I0 such that `not P(String[I])'.
+    %
+:- pred next_boundary(pred(char)::in(pred(in) is semidet), string::in, int::in,
+    int::out) is det.
+
+next_boundary(P, String, I0, I) :-
+    (
+        string.unsafe_index_next(String, I0, I1, Char),
+        P(Char)
+    ->
+        next_boundary(P, String, I1, I)
     ;
-        WordPre = preceding_boundary(SepP, String, WordEnd),
-        Word = string.unsafe_substring(String, WordPre + 1,
-            WordEnd - WordPre),
-        PrevWordEnd = preceding_boundary(isnt(SepP), String, WordPre),
-        Words = words_2(SepP, String, PrevWordEnd, [Word | Words0])
+        I = I0
     ).
 
 %------------------------------------------------------------------------------%
@@ -5113,33 +6035,32 @@ string.words(String) = string.words_separator(char.is_whitespace, String).
 
 string.split_at_separator(DelimP, String) = Substrings :-
     Len = string.length(String),
-    split_at_separator_2(DelimP, String, Len - 1, Len, [], Substrings).
+    split_at_separator_2(DelimP, String, Len, Len, [], Substrings).
 
 :- pred split_at_separator_2(pred(char)::in(pred(in) is semidet), string::in,
     int::in, int::in, list(string)::in, list(string)::out) is det.
 
 split_at_separator_2(DelimP, Str, I, SegEnd, Acc0, Acc) :-
-    % Walk Str backwards extending the accumulated list of chunks as chars
-    % matching DelimP are found.
+    % Walk Str backwards extending the accumulated list of chunks as code
+    % points matching DelimP are found.
     %
-    % Invariant: -1 =< I < length(Str)
+    % Invariant: 0 =< I =< length(Str)
     % SegEnd is one past the last index of the current segment.
     %
-    ( I < 0 ->
-        % We've reached the beginning of the string.
-        Seg = string.unsafe_substring(Str, 0, SegEnd),
-        Acc = [Seg | Acc0]
-    ;
-        C = string.unsafe_index(Str, I),
+    ( string.unsafe_prev_index(Str, I, J, C) ->
         ( DelimP(C) ->
             % Chop here.
-            SegStart = I + 1,
+            SegStart = I,
             Seg = string.unsafe_substring(Str, SegStart, SegEnd - SegStart),
-            split_at_separator_2(DelimP, Str, I - 1, I, [Seg | Acc0], Acc)
+            split_at_separator_2(DelimP, Str, J, J, [Seg | Acc0], Acc)
         ;
             % Extend current segment.
-            split_at_separator_2(DelimP, Str, I - 1, SegEnd, Acc0, Acc)
+            split_at_separator_2(DelimP, Str, J, SegEnd, Acc0, Acc)
         )
+    ;
+        % We've reached the beginning of the string.
+        Seg = string.unsafe_substring(Str, 0, SegEnd),
+        Acc = [Seg | Acc0]
     ).
 
 %------------------------------------------------------------------------------%
@@ -5156,36 +6077,16 @@ split_at_string(Needle, Total) =
 
 split_at_string(StartAt, NeedleLen, Needle, Total) = Out :-
     ( sub_string_search_start(Total, Needle, StartAt, NeedlePos) ->
-        BeforeNeedle = substring(Total, StartAt, NeedlePos-StartAt),
+        BeforeNeedle = substring(Total, StartAt, NeedlePos - StartAt),
         Tail = split_at_string(NeedlePos+NeedleLen, NeedleLen, Needle, Total),
         Out = [BeforeNeedle | Tail]
     ;
-        string__split(Total, StartAt, _skip, Last),
+        string.split(Total, StartAt, _Skip, Last),
         Out = [Last]
     ).
 
 %------------------------------------------------------------------------------%
 
-    % preceding_boundary(SepP, String, I) returns the largest index J =< I
-    % in String of the char that is SepP and min(-1, I) if there is no such J.
-    % preceding_boundary/3 is intended for finding (in reverse) consecutive
-    % maximal sequences of chars satisfying some property. Note that I
-    % *must not* exceed the largest valid index for String.
-    %
-:- func preceding_boundary(pred(char)::in(pred(in) is semidet), string::in,
-    int::in) = (int::out) is det.
-
-preceding_boundary(SepP, String, I) =
-    ( I < 0 ->
-        I
-    ; SepP(string.unsafe_index(String, I)) ->
-        I
-    ;
-        preceding_boundary(SepP, String, I - 1)
-    ).
-
-%------------------------------------------------------------------------------%
-
 S1 ++ S2 = string.append(S1, S2).
 
 %------------------------------------------------------------------------------%
@@ -5203,11 +6104,13 @@ string.det_base_string_to_int(Base, S) = N :-
 
 %-----------------------------------------------------------------------------%
 
-chomp(S) =
-    ( index(S, length(S) - 1, '\n') ->
-        left(S, length(S) - 1)
+chomp(S) = Chomp :-
+    % '\n' is always represented by a single code unit.
+    Offset = string.length(S) - 1,
+    ( index(S, Offset, '\n') ->
+        Chomp = left(S, Offset)
     ;
-        S
+        Chomp = S
     ).
 
 %-----------------------------------------------------------------------------%
@@ -5235,40 +6138,39 @@ lstrip_pred(P, S) = right(S, length(S) - prefix_length(P, S)).
 
 %-----------------------------------------------------------------------------%
 
-prefix_length(P, S) = prefix_length_2(0, length(S), P, S).
+prefix_length(P, S) = Index :-
+    prefix_length_2(P, S, 0, Index).
 
-:- func prefix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
-    string::in) = (int::out) is det.
+:- pred prefix_length_2(pred(char)::in(pred(in) is semidet),
+    string::in, int::in, int::out) is det.
 
-prefix_length_2(I, N, P, S) =
-    % XXX We are using if-then-elses to get ordered conjunction.
-    ( I < N ->
-        ( P(S ^ unsafe_elem(I)) ->
-            prefix_length_2(I + 1, N, P, S)
-        ;
-            I
-        )
+prefix_length_2(P, S, I, Index) :-
+    (
+        string.unsafe_index_next(S, I, J, Char),
+        P(Char)
+    ->
+        prefix_length_2(P, S, J, Index)
     ;
-        I
+        Index = I
     ).
 
 %-----------------------------------------------------------------------------%
 
-suffix_length(P, S) = suffix_length_2(length(S) - 1, length(S), P, S).
+suffix_length(P, S) = End - Index :-
+    End = string.length(S),
+    suffix_length_2(P, S, End, Index).
 
-:- func suffix_length_2(int::in, int::in, pred(char)::in(pred(in) is semidet),
-    string::in) = (int::out) is det.
+:- pred suffix_length_2(pred(char)::in(pred(in) is semidet),
+    string::in, int::in, int::out) is det.
 
-suffix_length_2(I, N, P, S) =
-    % XXX We are using if-then-elses to get ordered conjunction.
-    ( 0 =< I ->
-        ( P(S ^ unsafe_elem(I)) ->
-            suffix_length_2(I - 1, N, P, S)
-        ;
-            N - (I + 1)
-        )
+suffix_length_2(P, S, I, Index) :-
+    (
+        string.unsafe_prev_index(S, I, J, Char),
+        P(Char)
+    ->
+        suffix_length_2(P, S, J, Index)
     ;
-        N - (I + 1)
+        Index = I
     ).
 
 %------------------------------------------------------------------------------%
@@ -5686,28 +6588,6 @@ private_builtin_type_info_to_revstrings(PrivateBuiltinTypeInfo, !Rs) :-
 det_dynamic_cast(X, Y) :-
     det_univ_to_type(univ(X), Y).
 
-%-----------------------------------------------------------------------------%
-
-    % char_list_remove_suffix/3: We use this instead of the more general
-    % list.remove_suffix so that (for example) string.format will succeed in
-    % grade Java, even though unification has not yet been implemented.
-    %
-:- pred char_list_remove_suffix(list(char)::in, list(char)::in,
-    list(char)::out) is semidet.
-
-char_list_remove_suffix(List, Suffix, Prefix) :-
-    list.length(List, ListLength),
-    list.length(Suffix, SuffixLength),
-    PrefixLength = ListLength - SuffixLength,
-    list.split_list(PrefixLength, List, Prefix, Rest),
-    char_list_equal(Suffix, Rest).
-
-:- pred char_list_equal(list(char)::in, list(char)::in) is semidet.
-
-char_list_equal([], []).
-char_list_equal([X | Xs], [X | Ys]) :-
-    char_list_equal(Xs, Ys).
-
 %------------------------------------------------------------------------------%
 
 string.format_table(Columns, Separator) = Table :-
@@ -5730,7 +6610,7 @@ string.format_table(Columns, Separator) = Table :-
 string.format_table_max(ColumnsLimits, Separator) = Table :-
     MaxWidthsSenses = list.map(find_max_length_with_limit, ColumnsLimits),
     Columns = list.map(project_column_strings, ColumnsLimits),
-    SepLen = string.length(Separator),
+    SepLen = string.count_codepoints(Separator),
     generate_rows(MaxWidthsSenses, Separator, SepLen, Columns, [], RevRows),
     list.reverse(RevRows, Rows),
     Table = string.join_list("\n", Rows).
@@ -5784,7 +6664,7 @@ pad_row([Justify - MaxWidth | JustifyWidths], [ColumnString0 | ColumnStrings0],
     NextColumn = CurColumn + MaxWidth + SepLen,
     pad_row(JustifyWidths, ColumnStrings0, Separator, SepLen, NextColumn,
         LineRest),
-    ( string.length(ColumnString0) =< MaxWidth ->
+    ( string.count_codepoints(ColumnString0) =< MaxWidth ->
         (
             Justify = just_left,
             ColumnString = string.pad_right(ColumnString0, ' ', MaxWidth)
@@ -5870,7 +6750,7 @@ lpad(Chr, N, Str) = string.pad_left(Str, Chr, N).
 :- pred max_str_length(string::in, int::in, int::out) is det.
 
 max_str_length(Str, PrevMaxLen, MaxLen) :-
-    Length = string.length(Str),
+    Length = string.count_codepoints(Str),
     ( Length > PrevMaxLen ->
         MaxLen = Length
     ;
@@ -5883,7 +6763,7 @@ word_wrap(Str, N) = word_wrap_separator(Str, N, "").
 
 word_wrap_separator(Str, N, WordSep) = Wrapped :-
     Words = string.words_separator(char.is_whitespace, Str),
-    SepLen = string.length(WordSep),
+    SepLen = string.count_codepoints(WordSep),
     ( SepLen < N ->
         word_wrap_2(Words, WordSep, SepLen, 1, N, [], Wrapped)
     ;
@@ -5899,7 +6779,7 @@ word_wrap_2([], _, _, _, _, RevStrs,
 word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
     % Col is the column where the next character should be written if there
     % is space for a whole word.
-    WordLen = string.length(Word),
+    WordLen = string.count_codepoints(Word),
     (
         % We are on the first column and the length of the word
         % is less than the line length.
@@ -5972,10 +6852,10 @@ word_wrap_2([Word | Words], WordSep, SepLen, Col, N, Prev, Wrapped) :-
 :- func break_up_string_reverse(string, int, list(string)) = list(string).
 
 break_up_string_reverse(Str, N, Prev) = Strs :-
-    ( string.length(Str) =< N ->
+    ( string.count_codepoints(Str) =< N ->
         Strs = [Str | Prev]
     ;
-        string.split(Str, N, Left, Right),
+        string.split_by_codepoint(Str, N, Left, Right),
         Strs = break_up_string_reverse(Right, N, [Left | Prev])
     ).
 
diff --git a/library/term_io.m b/library/term_io.m
index 0be7ccb..9b7acf5 100644
--- a/library/term_io.m
+++ b/library/term_io.m
@@ -784,6 +784,7 @@ mercury_escape_char(Char) = EscapeCode :-
 is_mercury_source_char(Char) :-
     ( char.is_alnum(Char)
     ; is_mercury_punctuation_char(Char)
+    ; char.to_int(Char) >= 0x80
     ).
 
     % Currently we only allow the following characters.
diff --git a/runtime/mercury_string.c b/runtime/mercury_string.c
index 0f20219..fb6035b 100644
--- a/runtime/mercury_string.c
+++ b/runtime/mercury_string.c
@@ -112,6 +112,7 @@ MR_hash_string(MR_ConstString s)
 {
     MR_HASH_STRING_FUNC_BODY
 }
+
 MR_Integer
 MR_hash_string2(MR_ConstString s)
 {
@@ -123,3 +124,222 @@ MR_hash_string3(MR_ConstString s)
 {
     MR_HASH_STRING3_FUNC_BODY
 }
+
+MR_bool
+MR_utf8_next(const MR_String s_, int *pos)
+{
+    const unsigned char *s = (const unsigned char *)s_;
+    int c;
+
+    if (*pos == '\0') {
+        return MR_FALSE;
+    }
+
+    for (;;) {
+        ++(*pos);
+        if (*pos == '\0') {
+            break;
+        }
+        c = s[*pos];
+        if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+            break;
+        }
+    }
+
+    return MR_TRUE;
+}
+
+MR_bool
+MR_utf8_prev(const MR_String s_, int *pos)
+{
+    const unsigned char *s = (const unsigned char *)s_;
+    int c;
+
+    if (*pos <= 0) {
+        return MR_FALSE;
+    }
+
+    while (*pos > 0) {
+        (*pos)--;
+        c = s[*pos];
+        if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
+            break;
+        }
+    }
+
+    return MR_TRUE;
+}
+
+int
+MR_utf8_get(const MR_String s_, int pos)
+{
+    const unsigned char *s = (const unsigned char *)s_;
+    int c;
+    int remain;
+    int minc;
+    int i;
+
+    c = s[pos];
+
+    if (c <= 0x7F) {
+        /* Plain ASCII (including NUL terminator). */
+        return c;
+    }
+
+    if (c <= 0xC1) {
+        /* Trailing byte of multi-byte sequence or an overlong encoding for
+         * code point <= 127.
+         */
+        return -2;
+    }
+
+    if (c <= 0xDF) {
+        /* 2-byte sequence. */
+        c &= 0x1F;
+        remain = 1;
+        minc = 0x80;
+    }
+    else if (c <= 0xEF) {
+        /* 3-byte sequence. */
+        c &= 0x0F;
+        remain = 2;
+        minc = 0x800;
+    }
+    else if (c <= 0xF4) {
+        /* 4-byte sequence. */
+        c &= 0x07;
+        remain = 3;
+        minc = 0x10000;
+    }
+    else {
+        /* Otherwise invalid. */
+        return -2;
+    }
+
+    for (i = 1; i <= remain; i++) {
+        if (s[pos + i] == '\0') {
+            return -2;
+        }
+    }
+
+    while (remain--) {
+        int d = s[++pos];
+
+        if (!MR_utf8_is_trail_byte(d)) {
+            return -2;
+        }
+
+        c = (c << 6) | (d & 0x3F);
+    }
+
+    /* Check for overlong forms, which could be used to bypass security
+     * validations.  We could also check code points aren't above U+10FFFF or in
+     * the surrogate ranges, but we don't.
+     */
+
+    if (c < minc) {
+        return -2;
+    }
+
+    return c;
+}
+
+int
+MR_utf8_get_next(const MR_String s, int *pos)
+{
+    int c = MR_utf8_get(s, *pos);
+
+    if (c >= 0) {
+        (*pos) += MR_utf8_width(c);
+        return c;
+    }
+
+    /* Some invalid byte sequence. */
+    MR_utf8_next(s, pos);
+    return c;
+}
+
+int
+MR_utf8_prev_get(const MR_String s, int *pos)
+{
+    if (MR_utf8_prev(s, pos)) {
+        return MR_utf8_get(s, *pos);
+    }
+
+    /* Past beginning. */
+    return -1;
+}
+
+
+size_t
+MR_utf8_width(int c)
+{
+   /* So we don't need to check for negative values nor use unsigned ints
+    * in the interface, which are a pain.
+    */
+   unsigned int uc = c;
+
+   if (uc <= 0x7f)
+      return 1;
+   if (uc <= 0x7ff)
+      return 2;
+   if (uc <= 0xffff)
+      return 3;
+   if (uc <= 0x10ffff)
+      return 4;
+   /* The rest are illegal. */
+   return 0;
+}
+
+size_t
+MR_utf8_encode(char s_[], int c)
+{
+    unsigned char *s = (unsigned char *)s_;
+    unsigned int uc = c;
+
+    if (uc <= 0x7f) {
+        s[0] = uc;
+        return 1;
+    }
+
+    if (uc <= 0x7ff) {
+        s[0] = 0xC0 | ((uc >> 6) & 0x1F);
+        s[1] = 0x80 |  (uc       & 0x3F);
+        return 2;
+    }
+
+    if (uc <= 0xffff) {
+        s[0] = 0xE0 | ((uc >> 12) & 0x0F);
+        s[1] = 0x80 | ((uc >>  6) & 0x3F);
+        s[2] = 0x80 |  (uc        & 0x3F);
+        return 3;
+    }
+
+    if (uc <= 0x10ffff) {
+        s[0] = 0xF0 | ((uc >> 18) & 0x07);
+        s[1] = 0x80 | ((uc >> 12) & 0x3F);
+        s[2] = 0x80 | ((uc >>  6) & 0x3F);
+        s[3] = 0x80 |  (uc        & 0x3F);
+        return 4;
+    }
+
+    /* Otherwise is illegal. */
+    return 0;
+}
+
+MR_bool
+MR_utf8_verify(const MR_String s)
+{
+    int pos = 0;
+    int c;
+
+    for (;;) {
+        c = MR_utf8_get_next(s, &pos);
+        if (c == 0) {
+            return MR_TRUE;
+        }
+        if (c < 0) {
+            return MR_FALSE;
+        }
+    }
+}
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index b12afbe..6265e5d 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -15,24 +15,21 @@
 #include <stdarg.h>
 
 /*
-** Mercury characters are given type `MR_Char', which is a typedef for `char'.
-** But BEWARE: when stored in an MR_Integer, the value must be
-** first cast to `MR_UnsignedChar'.
-** Mercury strings are stored as pointers to '\0'-terminated arrays of MR_Char.
+** Mercury characters (Unicode code points) are given type `MR_Char', which is
+** a typedef for `int'.
+** Mercury strings are stored as pointers to '\0'-terminated arrays of `char'.
+** Strings are UTF-8 encoded.
 ** Mercury strings must not contain null characters. Unexpected null characters
 ** are a source of security vulnerabilities.
 **
-** We may eventually move to using wchar_t for Mercury characters and strings,
-** so it is important to use these typedefs.
-**
 ** The actual typedefs are in mercury_types.h to avoid problems with
 ** circular #includes.
 **
-** typedef char MR_Char;
-** typedef unsigned char MR_UnsignedChar;
+** typedef int MR_Char;
+** typedef unsigned int MR_UnsignedChar;
 **
-** typedef MR_Char *MR_String;
-** typedef const MR_Char *MR_ConstString;
+** typedef char *MR_String;
+** typedef const char *MR_ConstString;
 */
 
 /*
@@ -202,9 +199,9 @@
 		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
 		len = 0;						\
 		hash = 0;						\
-		while(((MR_ConstString)(s))[len]) {			\
+		while (((const unsigned char *)(s))[len]) {		\
 			hash ^= (hash << 5);				\
-			hash ^= (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+			hash ^= ((const unsigned char *)(s))[len];	\
 			len++;						\
 		}							\
 		hash ^= len;						\
@@ -217,9 +214,9 @@
 		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
 		len = 0;						\
 		hash = 0;						\
-		while(((MR_ConstString)(s))[len]) {			\
+		while (((const unsigned char *)(s))[len]) {		\
 			hash = hash * 37;				\
-			hash += (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+			hash += ((const unsigned char *)(s))[len];	\
 			len++;						\
 		}							\
 		hash ^= len;						\
@@ -232,9 +229,9 @@
 		MR_CHECK_EXPR_TYPE(s, MR_ConstString);			\
 		len = 0;						\
 		hash = 0;						\
-		while(((MR_ConstString)(s))[len]) {			\
+		while (((const unsigned char *)(s))[len]) {		\
 			hash = hash * 49;				\
-			hash += (MR_UnsignedChar) ((MR_ConstString)(s))[len]; \
+			hash += ((const unsigned char *)(s))[len];	\
 			len++;						\
 		}							\
 		hash ^= len;						\
@@ -314,4 +311,61 @@ MR_Integer	MR_hash_string3(MR_ConstString);
 
 MR_String MR_make_string(MR_Code *proclabel, const char *fmt, ...);
 
+/*
+** UTF-8 manipulation
+*/
+
+#define MR_utf8_is_single_byte(c)   (((unsigned)(c) & 0x80) == 0)
+#define MR_utf8_is_lead_byte(c)     (((unsigned)(c) - 0xC0) < 0x3E)
+#define MR_utf8_is_trail_byte(c)    (((unsigned)(c) & 0xC0) == 0x80)
+
+/*
+** Advance `*pos' to the beginning of the next code point in `s'.
+** If `*pos' is already at the end of the string then return MR_FALSE
+** without modifying `*pos'.
+*/
+extern MR_bool  MR_utf8_next(const MR_String s_, int *pos);
+
+/*
+** Rewind `*pos' to the beginning of the previous code point in `s'.
+** If `*pos' is already at the beginning of the string then return MR_FALSE
+** without modifying `*pos'.
+*/
+extern MR_bool  MR_utf8_prev(const MR_String s_, int *pos);
+
+/*
+** Decode and return the code point beginning at `pos' in `s'.
+** Return 0 if at the end of the string (i.e. the NUL terminator).
+** If an illegal code sequence exists at that offset, return -2.
+*/
+extern int      MR_utf8_get(const MR_String s, int pos);
+
+/*
+** Decode the code point beginning at `pos' in `s', and advance `*pos'.
+*/
+extern int      MR_utf8_get_next(const MR_String s, int *pos);
+
+/*
+** Rewind `*pos' to the beginning of the previous code point in `s'
+** and return that code code.
+** Return -1 if `*pos' is already at the beginning of the string.
+*/
+extern int      MR_utf8_prev_get(const MR_String s, int *pos);
+
+/*
+** Return the number of bytes required to encode the code point `c' in UTF-8.
+*/
+extern size_t   MR_utf8_width(int c);
+
+/*
+** Encode the code point `c' into the buffer `s'.
+** Return the number of bytes used.
+*/
+extern size_t   MR_utf8_encode(char s[], int c);
+
+/*
+** Return MR_TRUE iff `s' contains a valid UTF-8 encoded string.
+*/
+extern MR_bool  MR_utf8_verify(const MR_String s);
+
 #endif /* not MERCURY_STRING_H */
diff --git a/runtime/mercury_types.h b/runtime/mercury_types.h
index fc51fc8..fd32326 100644
--- a/runtime/mercury_types.h
+++ b/runtime/mercury_types.h
@@ -128,11 +128,11 @@ typedef MR_Code                 *MR_CodePtr;
 ** If you modify them, you will need to modify mercury_string.h as well.
 */
 
-typedef char            MR_Char;
-typedef unsigned char   MR_UnsignedChar;
+typedef int             MR_Char;
+typedef unsigned int    MR_UnsignedChar;
 
-typedef MR_Char         *MR_String;
-typedef const MR_Char   *MR_ConstString;
+typedef char            *MR_String;
+typedef const char      *MR_ConstString;
 
 /*
 ** Definitions for accessing the representation of the Mercury `array' type.
diff --git a/tests/debugger/tailrec1.m b/tests/debugger/tailrec1.m
index a2949d3..13e680f 100644
--- a/tests/debugger/tailrec1.m
+++ b/tests/debugger/tailrec1.m
@@ -55,7 +55,9 @@ tailrec1_read_strings(Stream, !Words, !IO) :-
     int     i;
 
     i = 0;
-    while ((c = mercury_getc((MercuryFilePtr) Stream)) != EOF && c != '\\n') {
+    while ((c = mercury_get_byte((MercuryFilePtr) Stream)) != EOF &&
+        c != '\\n')
+    {
         if (i < 100) {
             buf[i] = c;
         }
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 5df4d3e..1d77007 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -294,6 +294,7 @@ ORDINARY_PROGS=	\
 	user_compare \
 	user_defined_equality2 \
 	value_enum \
+	words_separator \
 	write \
 	write_reg1 \
 	write_reg2 \
diff --git a/tests/hard_coded/nonascii.exp b/tests/hard_coded/nonascii.exp
index 88fe4c9..28d7cbc 100644
--- a/tests/hard_coded/nonascii.exp
+++ b/tests/hard_coded/nonascii.exp
@@ -252,6 +252,11 @@
 253,
 254,
 255,
+2047,
+2048,
+65535,
+65536,
+1114111,
 10
 1
 2
@@ -507,4 +512,9 @@
 253
 254
 255
+2047
+2048
+65535
+65536
+1114111
 10
diff --git a/tests/hard_coded/nonascii.m b/tests/hard_coded/nonascii.m
index 23fa3eb..c7825fc 100644
--- a/tests/hard_coded/nonascii.m
+++ b/tests/hard_coded/nonascii.m
@@ -18,11 +18,12 @@ main(!IO) :-
 	(
 		Result = ok(Stream),
 		test1(Stream, !IO),
-		test2(Stream, 255, !IO)
+		test2(Stream, 260, !IO)
 	;
 		Result = error(Error),
 		io__error_message(Error, Msg),
-		io__write_string(Msg, !IO)
+		io__write_string(Msg, !IO),
+		io__nl(!IO)
 	).
 
 :- pred test1(io__input_stream::in, io::di, io::uo) is det.
@@ -41,7 +42,8 @@ test1(Stream, !IO) :-
 	;
 		Result = error(Error),
 		io__error_message(Error, Msg),
-		io__write_string(Msg, !IO)
+		io__write_string(Msg, !IO),
+		io__nl(!IO)
 	).
 
 :- pred test2(io__input_stream::in, int::in, io::di, io::uo) is det.
@@ -60,7 +62,8 @@ test2(Stream, N, !IO) :-
 		;
 			Result = error(Error),
 			io__error_message(Error, Msg),
-			io__write_string(Msg, !IO)
+			io__write_string(Msg, !IO),
+			io__nl(!IO)
 		),
 		test2(Stream, N - 1, !IO)
 	;
diff --git a/tests/hard_coded/nonascii_gen.c b/tests/hard_coded/nonascii_gen.c
index b3a7196..2cf3423 100644
--- a/tests/hard_coded/nonascii_gen.c
+++ b/tests/hard_coded/nonascii_gen.c
@@ -1,24 +1,73 @@
 #include <stdio.h>
 
+size_t
+MR_utf8_encode(char s_[], int c)
+{
+	unsigned char *s = (unsigned char *)s_;
+	unsigned int uc = c;
+
+	if (uc <= 0x7f) {
+		s[0] = uc;
+		return 1;
+	}
+
+	if (uc <= 0x7ff) {
+		s[0] = 0xC0 | ((uc >> 6) & 0x1F);
+		s[1] = 0x80 |  (uc       & 0x3F);
+		return 2;
+	}
+
+	if (uc <= 0xffff) {
+		s[0] = 0xE0 | ((uc >> 12) & 0x0F);
+		s[1] = 0x80 | ((uc >>  6) & 0x3F);
+		s[2] = 0x80 |  (uc        & 0x3F);
+		return 3;
+	}
+
+	if (uc <= 0x10ffff) {
+		s[0] = 0xF0 | ((uc >> 18) & 0x07);
+		s[1] = 0x80 | ((uc >> 12) & 0x3F);
+		s[2] = 0x80 | ((uc >>  6) & 0x3F);
+		s[3] = 0x80 |  (uc        & 0x3F);
+		return 4;
+	}
+
+	/* Otherwise is illegal. */
+	return 0;
+}
+
+void
+put_utf8(int c)
+{
+	char	s[4];
+	size_t	n;
+	size_t	i;
+
+	n = MR_utf8_encode(s, c);
+	for (i = 0; i < n; i++) {
+		putchar(s[i]);
+	}
+}
+
 int
 main(void)
 {
+	int repeat;
 	int	i;
 
-	for (i = 1; i < 256; i++) {
-		if (i != '\n') {
-			putchar(i);
+	for (repeat = 0; repeat < 2; repeat++) {
+		for (i = 1; i < 256; i++) {
+			if (i != '\n') {
+				put_utf8(i);
+			}
 		}
-	}
-
-	putchar('\n');
 
-	for (i = 1; i < 256; i++) {
-		if (i != '\n') {
-			putchar(i);
-		}
+		put_utf8(0x0007ff);
+		put_utf8(0x000800);
+		put_utf8(0x00ffff);
+		put_utf8(0x010000);
+		put_utf8(0x10ffff);
+		put_utf8('\n');
 	}
-
-	putchar('\n');
 	return 0;
 }
diff --git a/tests/hard_coded/special_char.exp b/tests/hard_coded/special_char.exp
index 52dc034..91f2fe8 100644
--- a/tests/hard_coded/special_char.exp
+++ b/tests/hard_coded/special_char.exp
@@ -1,4 +1,4 @@
 Hello world
 
 

	
-?SÑ
+?SÑ
diff --git a/tests/hard_coded/unicode_test.exp b/tests/hard_coded/unicode_test.exp
index 5ed1bf3..cccf2b8 100644
--- a/tests/hard_coded/unicode_test.exp
+++ b/tests/hard_coded/unicode_test.exp
@@ -1,11 +1,11 @@
 00000011
 00000011
-11001110 10010100
-11001110 10100000
-11101111 10111111 10111111
-11110100 10001111 10111111 10111111
-11110010 10101011 10110011 10011110
-01110010 11000011 10101001 01110011 01110101 01101101 11000011 10101001
+1110010100
+1110100000
+1111111111111111
+100001111111111111111
+10101011110011011110
+01110010 11101001 01110011 01110101 01101101 11101001
 01100001 01100010 01100011 00110001 00110010 00110011
 01011100 01110101 00110000 00110000 00110100 00110001
 01011100 01110101 00110000 00110000 00110100 00110001
diff --git a/tests/hard_coded/words_separator.exp b/tests/hard_coded/words_separator.exp
new file mode 100644
index 0000000..a330c01
--- /dev/null
+++ b/tests/hard_coded/words_separator.exp
@@ -0,0 +1,8 @@
+[]
+[]
+[]
+["x"]
+["x"]
+["x", "y"]
+["x", "y"]
+["x", "y"]
diff --git a/tests/hard_coded/words_separator.m b/tests/hard_coded/words_separator.m
new file mode 100644
index 0000000..bcaa715
--- /dev/null
+++ b/tests/hard_coded/words_separator.m
@@ -0,0 +1,43 @@
+%-----------------------------------------------------------------------------%
+
+:- module words_separator.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%-----------------------------------------------------------------------------%
+
+main(!IO) :-
+    S1 = words_separator(comma, ""),
+    S2 = words_separator(comma, ","),
+    S3 = words_separator(comma, ",,"),
+    S4 = words_separator(comma, "x,"),
+    S5 = words_separator(comma, ",x"),
+    S6 = words_separator(comma, "x,y"),
+    S7 = words_separator(comma, "x,,y"),
+    S8 = words_separator(comma, ",x,,y,"),
+    list.foldl(write_nl, [S1, S2, S3, S4, S5, S6, S7, S8], !IO).
+
+:- pred comma(char::in) is semidet.
+
+comma(',').
+
+:- pred write_nl(list(string)::in, io::di, io::uo) is det.
+
+write_nl(L, !IO) :-
+    io.write(L, !IO),
+    io.nl(!IO).
+
+%-----------------------------------------------------------------------------%
+% vim: ft=mercury ts=8 sts=4 sw=4 et
diff --git a/tests/valid/mercury_java_parser_follow_code_bug.m b/tests/valid/mercury_java_parser_follow_code_bug.m
index a9d42a4..92dee13 100644
--- a/tests/valid/mercury_java_parser_follow_code_bug.m
+++ b/tests/valid/mercury_java_parser_follow_code_bug.m
@@ -500,7 +500,7 @@ null_literal -->
 %    [5]Contents | [6]Prev | [7]Next | [8]Index Java Language Specification
 %    Second Edition
 %
-%    [9]Copyright © 2000 Sun Microsystems, Inc. All rights reserved
+%    [9]Copyright © 2000 Sun Microsystems, Inc. All rights reserved
 %    Please send any comments or corrections to the [10]JLS team
 
 %                                     Syntax

--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list