[m-rev.] for review: fix some problems with non-ASCII strings

Peter Wang novalazy at gmail.com
Tue May 17 11:03:02 AEST 2011


Branches: main

Fix some problems with non-ASCII strings.

compiler/error_util.m:
	Make string formatting routines count code points instead of code
	units.

	Iterate over non-ASCII strings correctly.

compiler/llds_out_data.m:
compiler/stack_layout.m:
	Explicitly count UTF-8 code units when counting the length of strings
	that will be written to C source files, in case the compiler is built
	in a grade which uses some other encoding. (The length argument to
	the `MR_string_const' macro isn't actually used, so that change won't
	have any practical effect.)

compiler/inst_check.m:
compiler/mercury_to_mercury.m:
compiler/structure_reuse.direct.choose_reuse.m:
	Fix some code unit counts which should be code point counts.

compiler/make.m:
	Iterate over non-ASCII characters correctly.

compiler/passes_aux.m:
	Simplify the implementation of `stage_num_str'.

compiler/timestamp.m:
	Make `string_to_timestamp' handle non-ASCII strings cleanly,
	although they should never occur.

compiler/x86_64_out.m:
	Split long comments at code point boundaries.

compiler/elds_to_erlang.m:
compiler/erl_code_gen.m:
compiler/mlds_to_cs.m:
compiler/pickle.m:
compiler/switch_util.m:
	Add some comments relating to non-ASCII characters.

library/parsing_utils.m:
	Fix the string and keyword parsers to work on non-ASCII characters.

library/pprint.m:
library/pretty_printer.m:
	Fix code unit counts which should be code point counts.

library/string.m:
	Add `count_utf8_code_units'.

	Fix the portable implementation of `string.to_char_list' (not
	actually used) to work on non-ASCII strings.

	Make string formatting routines count code points instead of code
	units.

library/term_io.m:
	Use a direct string comparison to check that a string is non-empty.

tests/general/test_parsing_utils.exp:
tests/general/test_parsing_utils.m:
tests/hard_coded/test_pretty_printer.exp:
tests/hard_coded/test_pretty_printer.m:
	Test non-ASCII strings.

diff --git a/compiler/elds_to_erlang.m b/compiler/elds_to_erlang.m
index f1e8ea9..76ec6ba 100644
--- a/compiler/elds_to_erlang.m
+++ b/compiler/elds_to_erlang.m
@@ -957,9 +957,10 @@ output_rtti_id(ModuleInfo, RttiId, !IO) :-
 :- func shorten_long_atom_name(string) = string.
 
 shorten_long_atom_name(Name0) = Name :-
-    % Erlang atom names can be up to 255 characters long, but the Erlang
-    % compiler may mangle it (e.g. to derive the names of anonymous functions)
-    % which would then exceed the limit.
+    % Erlang atom names can be up to 255 characters (bytes) long, but the
+    % Erlang compiler may mangle it (e.g. to derive the names of anonymous
+    % functions) which would then exceed the limit.
+    % This assumes the atom name consists of only ASCII characters.
     (if string.length(Name0) =< 200 then
         Name = Name0
     else
diff --git a/compiler/erl_code_gen.m b/compiler/erl_code_gen.m
index 1fbf700..5ae5286 100644
--- a/compiler/erl_code_gen.m
+++ b/compiler/erl_code_gen.m
@@ -799,9 +799,9 @@ erl_gen_switch(Var, CanFail, CasesList, CodeModel, InstMap0, _Context,
 
         list.length(CasesList) > switch_strings_as_atoms_limit,
 
-        % The Erlang implementation limits atoms to be 255 characters long or
-        % less, so we don't use the workaround if any cases are longer than
-        % that.
+        % The Erlang implementation limits atoms to be 255 characters (bytes)
+        % long or less, so we don't use the workaround if any cases are longer
+        % than that.
         all [String] (
             (
                 list.member(case(MainConsId, OtherConsIds, _), CasesList),
diff --git a/compiler/error_util.m b/compiler/error_util.m
index 6861c22..43fe26c 100644
--- a/compiler/error_util.m
+++ b/compiler/error_util.m
@@ -950,9 +950,9 @@ do_write_error_pieces(TreatAsFirst, MaybeContext, FixedIndent, MaxWidth,
         MaybeContext = yes(Context),
         term.context_file(Context, FileName),
         term.context_line(Context, LineNumber),
-        string.length(FileName, FileNameLength),
+        string.count_codepoints(FileName, FileNameLength),
         string.int_to_string(LineNumber, LineNumberStr),
-        string.length(LineNumberStr, LineNumberStrLength0),
+        string.count_codepoints(LineNumberStr, LineNumberStrLength0),
         ( LineNumberStrLength0 < 3 ->
             LineNumberStrLength = 3
         ;
@@ -1342,10 +1342,8 @@ break_into_words(String, Words0, Words) :-
 break_into_words_from(String, Cur, Words0, Words) :-
     ( find_word_start(String, Cur, Start) ->
         find_word_end(String, Start, End),
-        Length = End - Start + 1,
-        string.substring(String, Start, Length, WordStr),
-        Next = End + 1,
-        break_into_words_from(String, Next, [plain_word(WordStr) | Words0],
+        string.substring(String, Start, End - Start, WordStr),
+        break_into_words_from(String, End, [plain_word(WordStr) | Words0],
             Words)
     ;
         Words = Words0
@@ -1354,9 +1352,8 @@ break_into_words_from(String, Cur, Words0, Words) :-
 :- pred find_word_start(string::in, int::in, int::out) is semidet.
 
 find_word_start(String, Cur, WordStart) :-
-    string.index(String, Cur, Char),
+    string.unsafe_index_next(String, Cur, Next, Char),
     ( char.is_whitespace(Char) ->
-        Next = Cur + 1,
         find_word_start(String, Next, WordStart)
     ;
         WordStart = Cur
@@ -1365,8 +1362,7 @@ find_word_start(String, Cur, WordStart) :-
 :- pred find_word_end(string::in, int::in, int::out) is det.
 
 find_word_end(String, Cur, WordEnd) :-
-    Next = Cur + 1,
-    ( string.index(String, Next, Char) ->
+    ( string.unsafe_index_next(String, Cur, Next, Char) ->
         ( char.is_whitespace(Char) ->
             WordEnd = Cur
         ;
@@ -1451,7 +1447,7 @@ group_nonfirst_line_words(Words, Indent, Max, Lines) :-
     list(string)::out, list(string)::out) is det.
 
 get_line_of_words(FirstWord, LaterWords, Indent, Max, Line, RestWords) :-
-    string.length(FirstWord, FirstWordLen),
+    string.count_codepoints(FirstWord, FirstWordLen),
     Avail = Max - Indent * indent_increment,
     get_later_words(LaterWords, FirstWordLen, Avail, [FirstWord],
         Line, RestWords).
@@ -1461,7 +1457,7 @@ get_line_of_words(FirstWord, LaterWords, Indent, Max, Line, RestWords) :-
 
 get_later_words([], _, _, Line, Line, []).
 get_later_words([Word | Words], OldLen, Avail, Line0, Line, RestWords) :-
-    string.length(Word, WordLen),
+    string.count_codepoints(Word, WordLen),
     NewLen = OldLen + 1 + WordLen,
     ( NewLen =< Avail ->
         list.append(Line0, [Word], Line1),
diff --git a/compiler/inst_check.m b/compiler/inst_check.m
index 64f5175..1492da3 100644
--- a/compiler/inst_check.m
+++ b/compiler/inst_check.m
@@ -247,7 +247,7 @@ find_types_for_functor(FunctorsToTypes, Functor, Types) :-
             % Zero arity functors with length 1 could match the builtin
             % character type.
             Name = unqualified(NameStr),
-            string.length(NameStr) = 1
+            string.count_codepoints(NameStr) = 1
         ->
             TypesExceptTuple = [type_builtin(builtin_type_char)
                 | TypesExceptChar]
diff --git a/compiler/llds_out_data.m b/compiler/llds_out_data.m
index 6ab3124..171a2b1 100644
--- a/compiler/llds_out_data.m
+++ b/compiler/llds_out_data.m
@@ -1025,18 +1025,18 @@ output_rval_const(Info, Const, !IO) :-
         Const = llconst_string(String),
         io.write_string("MR_string_const(""", !IO),
         c_util.output_quoted_string(String, !IO),
-        string.length(String, StringLength),
         io.write_string(""", ", !IO),
-        io.write_int(StringLength, !IO),
+        io.write_int(string.count_utf8_code_units(String), !IO),
         io.write_string(")", !IO)
     ;
-        Const = llconst_multi_string(String),
+        Const = llconst_multi_string(Strings),
         io.write_string("MR_string_const(""", !IO),
-        c_util.output_quoted_multi_string(String, !IO),
+        c_util.output_quoted_multi_string(Strings, !IO),
         io.write_string(""", ", !IO),
 
         % The "+1" is for the NULL character.
-        Length = list.foldl((func(S, L0) = L0 + length(S) + 1), String, 0),
+        SumLengths = (func(S, L0) = L0 + string.count_utf8_code_units(S) + 1),
+        Length = list.foldl(SumLengths, Strings, 0),
         io.write_int(Length, !IO),
         io.write_string(")", !IO)
     ;
diff --git a/compiler/make.m b/compiler/make.m
index f722e78..f377619 100644
--- a/compiler/make.m
+++ b/compiler/make.m
@@ -428,7 +428,7 @@ make_target(Globals, Target, Success, !Info, !IO) :-
 classify_target(Globals, FileName, ModuleName - TargetType) :-
     (
         string.length(FileName, NameLength),
-        search_backwards_for_dot(FileName, NameLength - 1, DotLocn),
+        search_backwards_for_dot(FileName, NameLength, DotLocn),
         string.split(FileName, DotLocn, ModuleNameStr0, Suffix),
         solutions(classify_target_2(Globals, ModuleNameStr0, Suffix),
             TargetFiles),
@@ -549,11 +549,11 @@ classify_target_2(Globals, ModuleNameStr0, Suffix, ModuleName - TargetType) :-
 :- pred search_backwards_for_dot(string::in, int::in, int::out) is semidet.
 
 search_backwards_for_dot(String, Index, DotIndex) :-
-    Index >= 0,
-    ( string.index_det(String, Index, '.') ->
-        DotIndex = Index
+    string.unsafe_prev_index(String, Index, CharIndex, Char),
+    ( Char = ('.') ->
+        DotIndex = CharIndex
     ;
-        search_backwards_for_dot(String, Index - 1, DotIndex)
+        search_backwards_for_dot(String, CharIndex, DotIndex)
     ).
 
 :- func get_executable_type(globals) = linked_target_type.
diff --git a/compiler/mercury_to_mercury.m b/compiler/mercury_to_mercury.m
index bdaada4..aeda650 100644
--- a/compiler/mercury_to_mercury.m
+++ b/compiler/mercury_to_mercury.m
@@ -4359,7 +4359,7 @@ mercury_limited_term_nq_to_string(VarSet, AppendVarnums, NextToGraphicToken,
         Limit, Term) = String :-
     mercury_format_term_nq(VarSet, AppendVarnums, NextToGraphicToken, Term,
         "", FullString),
-    FullLen = string.length(FullString),
+    FullLen = string.count_codepoints(FullString),
     ( FullLen =< Limit ->
         String = FullString
     ;
diff --git a/compiler/mlds_to_cs.m b/compiler/mlds_to_cs.m
index 1585856..45f3b4c 100644
--- a/compiler/mlds_to_cs.m
+++ b/compiler/mlds_to_cs.m
@@ -2011,6 +2011,7 @@ write_identifier_string(String, !IO) :-
     % Although the C# spec does not limit identifier lengths, the Microsoft
     % compiler restricts identifiers to 511 characters and Mono restricts
     % identifiers to 512 characters.
+    % This assumes the identifier contains only ASCII characters.
     Length = string.length(String),
     ( Length > 511 ->
         Left = string.left(String, 251),
diff --git a/compiler/passes_aux.m b/compiler/passes_aux.m
index 9ce8a64..8f5b86c 100644
--- a/compiler/passes_aux.m
+++ b/compiler/passes_aux.m
@@ -546,15 +546,7 @@ should_dump_stage(StageNum, StageNumStr, StageName, DumpStages) :-
         )
     ).
 
-stage_num_str(StageNum) = StageNumStr :-
-    int_to_string(StageNum, StageNumStr0),
-    ( string.length(StageNumStr0, 1) ->
-        StageNumStr = "00" ++ StageNumStr0
-    ; string.length(StageNumStr0, 2) ->
-        StageNumStr = "0" ++ StageNumStr0
-    ;
-        StageNumStr = StageNumStr0
-    ).
+stage_num_str(StageNum) = string.format("%03d", [i(StageNum)]).
 
 maybe_dump_hlds(HLDS, StageNum, StageName, !DumpInfo, !IO) :-
     module_info_get_globals(HLDS, Globals),
diff --git a/compiler/pickle.m b/compiler/pickle.m
index 616289f..23a8ca0 100644
--- a/compiler/pickle.m
+++ b/compiler/pickle.m
@@ -393,6 +393,7 @@ sign_extend_32(X) = R :-
 :- pred pickle_char(char::in, io::di, io::uo) is det.
 
 pickle_char(Char, !IO) :-
+    % XXX handle non-ASCII characters
     char.to_int(Char, Int),
     io.write_byte(Int, !IO).
 
diff --git a/compiler/stack_layout.m b/compiler/stack_layout.m
index 7cda580..55a0e3c 100644
--- a/compiler/stack_layout.m
+++ b/compiler/stack_layout.m
@@ -117,6 +117,7 @@
 :- import_module parse_tree.prog_event.
 
 :- import_module bool.
+:- import_module char.
 :- import_module cord.
 :- import_module counter.
 :- import_module int.
@@ -2533,7 +2534,7 @@ lookup_string_in_table(String, Offset, !StringTable) :-
     ( map.search(TableMap0, String, OldOffset) ->
         Offset = OldOffset
     ;
-        string.length(String, Length),
+        Length = string.count_utf8_code_units(String),
         TableOffset = TableOffset0 + Length + 1,
         % We use a 32 bit unsigned integer to represent the offset.
         % Computing that limit exactly without getting an overflow
diff --git a/compiler/structure_reuse.direct.choose_reuse.m b/compiler/structure_reuse.direct.choose_reuse.m
index 0f58aa9..fd92de8 100644
--- a/compiler/structure_reuse.direct.choose_reuse.m
+++ b/compiler/structure_reuse.direct.choose_reuse.m
@@ -1294,7 +1294,7 @@ line_length = 79.
 dump_line(Msg, !IO) :-
     Prefix = "%---",
     Start = string.append(Prefix, Msg),
-    Remainder = line_length - string.length(Start) - 1,
+    Remainder = line_length - string.count_codepoints(Start) - 1,
     Line = Start ++ string.duplicate_char('-', Remainder),
     io.write_string(Line, !IO),
     io.write_string("%\n", !IO).
diff --git a/compiler/switch_util.m b/compiler/switch_util.m
index 1f14313..ab01d49 100644
--- a/compiler/switch_util.m
+++ b/compiler/switch_util.m
@@ -543,6 +543,7 @@ estimate_switch_tag_test_cost(Tag) = Cost :-
         % the end of the string. The multiplication is an attempt to factor in
         % the fact that each character comparison is in a loop, and thus takes
         % more than one instruction.
+        % On non-ASCII strings, this cost depends on the compiler back-end.
         Cost = 1 + 2 * string.length(String)
     ;
         Tag = shared_with_reserved_addresses_tag(RAs, SubTag),
diff --git a/compiler/timestamp.m b/compiler/timestamp.m
index 81be210..e28f41f 100644
--- a/compiler/timestamp.m
+++ b/compiler/timestamp.m
@@ -53,6 +53,7 @@
 
 :- implementation.
 
+:- import_module char.
 :- import_module int.
 :- import_module maybe.
 :- import_module string.
@@ -153,9 +154,11 @@ timestamp_to_string(timestamp(Timestamp)) = Timestamp.
 
 string_to_timestamp(Timestamp) = timestamp(Timestamp) :-
     % The if-then-else here is to force order of evaluation --
-    % we need to ensure that the length check occurs before the
-    % calls to unsafe_undex to avoid dereferencing invalid pointers.
+    % we need to ensure that the sanity checks occur before the
+    % calls to unsafe_index. The offsets are only valid if the string
+    % contains only ASCII characters, as expected.
     (
+        string.all_match(plausible_timestamp_char, Timestamp),
         string.length(Timestamp) : int = string.length("yyyy-mm-dd hh:mm:ss")
     ->
         string.to_int(string.unsafe_substring(Timestamp, 0, 4), _),
@@ -192,3 +195,10 @@ string_to_timestamp(Timestamp) = timestamp(Timestamp) :-
     ;
         fail
     ).
+
+:- pred plausible_timestamp_char(char::in) is semidet.
+
+plausible_timestamp_char(Char) :-
+    char.to_int(Char, CharInt),
+    char.to_int(':', HighestInt),
+    CharInt =< HighestInt.
diff --git a/compiler/x86_64_out.m b/compiler/x86_64_out.m
index 0dbc518..2145434 100644
--- a/compiler/x86_64_out.m
+++ b/compiler/x86_64_out.m
@@ -509,10 +509,11 @@ output_x86_64_instr_list(Stream, Instrs, !IO) :-
     io::di, io::uo) is det <= stream.writer(Stream, string, io).
 
 output_x86_64_instr(Stream, x86_64_comment(Comment), !IO) :-
-    ( string.length(Comment) > 0 ->
+    ( Comment \= "" ->
         put(Stream, "\t# ", !IO),
-        ( string.length(Comment) > comment_length ->
-            string.split(Comment, comment_length, Comment1, Comment2),
+        ( string.count_codepoints(Comment) > comment_length ->
+            string.split_by_codepoint(Comment, comment_length,
+                Comment1, Comment2),
             put(Stream, string.word_wrap(Comment1, comment_length), !IO),
             put(Stream, "\n", !IO),
             output_x86_64_instr(Stream, x86_64_comment(Comment2), !IO)
@@ -523,7 +524,7 @@ output_x86_64_instr(Stream, x86_64_comment(Comment), !IO) :-
         true
     ).
 output_x86_64_instr(Stream, x86_64_label(LabelName), !IO) :-
-    ( string.length(LabelName) > 0 ->
+    ( LabelName \= "" ->
         put(Stream, "\n" ++ LabelName ++ ":", !IO)
     ;
         true
@@ -1057,7 +1058,7 @@ output_x86_64_inst(Stream, xor(Src, Dest), !IO) :-
     is det <= stream.writer(Stream, string, io).
 
 output_x86_64_comment(Stream, Comment, !IO) :-
-    ( string.length(Comment) > 0 ->
+    ( Comment \= "" ->
         put(Stream, "\t# ", !IO),
         put(Stream, Comment, !IO)
     ;   
diff --git a/library/parsing_utils.m b/library/parsing_utils.m
index 0114bb5..a29138a 100644
--- a/library/parsing_utils.m
+++ b/library/parsing_utils.m
@@ -159,6 +159,7 @@
 
     % ikeyword(IdChars, Keyword, Src, _, !PS)
     % Case-insensitive version of keyword/6.
+    % Only upper and lowercase unaccented Latin letters are treated specially.
     %
 :- pred ikeyword(string::in, string::in, src::in, unit::out,
     ps::in, ps::out) is semidet.
@@ -498,11 +499,11 @@ eof(Src, unit, !PS) :-
 
 next_char(Src, Char, !PS) :-
     promise_pure (
-        current_offset(Src, Offset, !PS),
+        current_offset(Src, Offset, !.PS, _),
         Offset < Src ^ input_length,
-        Char = Src ^ input_string ^ unsafe_elem(Offset),
+        string.unsafe_index_next(Src ^ input_string, Offset, NextOffset, Char),
         impure record_progress(Src, Offset),
-        !:PS = !.PS + 1
+        !:PS = NextOffset
     ).
 
 %-----------------------------------------------------------------------------%
@@ -538,7 +539,8 @@ match_string(MatchStr, Src, PS, PS + N) :-
 
 match_string_2(N, I, MatchStr, Offset, Str) :-
     ( if I < N then
-        MatchStr ^ unsafe_elem(I) = Str ^ unsafe_elem(Offset + I),
+        string.unsafe_index_code_unit(MatchStr, I, CodeUnit),
+        string.unsafe_index_code_unit(Str, Offset + I, CodeUnit),
         match_string_2(N, I + 1, MatchStr, Offset, Str)
       else
         true
@@ -559,9 +561,14 @@ imatch_string(MatchStr, Src, PS, PS + N) :-
 
 imatch_string_2(N, I, MatchStr, Offset, Str) :-
     ( if I < N then
-        char.to_upper(MatchStr ^ unsafe_elem(I), Chr1),
-        char.to_upper(Str ^ unsafe_elem(Offset + I), Chr2),
-        Chr1 = Chr2,
+        % We can compare by code units because char.to_upper only converts
+        % letters in the ASCII range, and ASCII characters are always encoded
+        % in a single code unit.
+        string.unsafe_index_code_unit(MatchStr, I, CodeUnit1),
+        string.unsafe_index_code_unit(Str, Offset + I, CodeUnit2),
+        char.det_from_int(CodeUnit1, Chr1),
+        char.det_from_int(CodeUnit2, Chr2),
+        char.to_upper(Chr1) = char.to_upper(Chr2) : char,
         imatch_string_2(N, I + 1, MatchStr, Offset, Str)
       else
         true
@@ -824,12 +831,13 @@ digits_2(Base, Src, unit, !PS) :-
 %-----------------------------------------------------------------------------%
 
 string_literal(QuoteChar, Src, String, !PS) :-
-    current_offset(Src, Start, !PS),
     next_char(Src, QuoteChar, !PS),
+    current_offset(Src, Start, !PS),
     string_literal_2(Src, QuoteChar, _, !PS),
     current_offset(Src, EndPlusOne, !PS),
+    string.unsafe_prev_index(Src ^ input_string, EndPlusOne, End, QuoteChar),
     skip_whitespace(Src, !PS),
-    input_substring(Src, Start + 1, EndPlusOne - 1, String).
+    input_substring(Src, Start, End, String).
 
 %-----------------------------------------------------------------------------%
 
diff --git a/library/pprint.m b/library/pprint.m
index ba19416..8b6cda9 100644
--- a/library/pprint.m
+++ b/library/pprint.m
@@ -496,7 +496,7 @@ lb(P, W, K0, K, I, 'LABEL'(L, X), S0, S) :-
     lb(P, W, K0, K, I ++ L, X, S0, S).
 
 lb(P, _, _,  K, I, 'LINE',        S0, S) :-
-    K = string.length(I),
+    K = string.count_codepoints(I),
     P("\n", S0, S1),
     P(I,    S1, S ).
 
@@ -509,7 +509,7 @@ lb(P, W, K0, K, I, 'DOC'(D, U),   S0, S) :-
     lb(P, W, K0, K, I, to_doc(D, univ_value(U)), S0, S).
 
 lb(P, _, K0, K, _, 'TEXT'(T),     S0, S) :-
-    K = K0 + string.length(T),
+    K = K0 + string.count_codepoints(T),
     P(T, S0, S).
 
 %-----------------------------------------------------------------------------%
@@ -531,7 +531,7 @@ ff('LINE',        R) = R.
 ff('GROUP'(X),    R) = ff(X, R).
 ff('DOC'(D, U),   R) = ff(to_doc(D, univ_value(U)), R).
 ff('TEXT'(S),     R) = R - L :-
-    L = string.length(S),
+    L = string.count_codepoints(S),
     R > L.
 
 %-----------------------------------------------------------------------------%
@@ -563,7 +563,7 @@ layout_flat(P, K0, K, 'DOC'(D, U),   S0, S) :-
     layout_flat(P, K0, K, to_doc(D, univ_value(U)), S0, S).
 
 layout_flat(P, K0, K, 'TEXT'(T),     S0, S) :-
-    K = K0 + string.length(T),
+    K = K0 + string.count_codepoints(T),
     P(T, S0, S).
 
 %-----------------------------------------------------------------------------%
diff --git a/library/pretty_printer.m b/library/pretty_printer.m
index 8433d0a..f959638 100644
--- a/library/pretty_printer.m
+++ b/library/pretty_printer.m
@@ -381,12 +381,13 @@ write_doc_to_stream(Stream, Canonicalize, FMap, LineWidth, [Doc | Docs0],
             %
             Doc = str(String),
             stream.put(Stream, String, !IO),
-            !:RemainingWidth = !.RemainingWidth - string.length(String),
+            !:RemainingWidth = !.RemainingWidth -
+                string.count_codepoints(String),
             Docs = Docs0
         ;
             Doc = nl,
             ( if
-                F = ( func(S, W) = string.length(S) + W ),
+                F = ( func(S, W) = string.count_codepoints(S) + W ),
                 IndentWidth = list.foldl(F, !.Indents, 0),
                 !.RemainingWidth < LineWidth - IndentWidth
               then
@@ -479,7 +480,7 @@ output_current_group(Stream, LineWidth, Indents, OpenGroups,
         [Doc | Docs0], Docs, !RemainingWidth, !RemainingLines, !IO) :-
     ( if Doc = str(String) then
         stream.put(Stream, String, !IO),
-        !:RemainingWidth = !.RemainingWidth - string.length(String),
+        !:RemainingWidth = !.RemainingWidth - string.count_codepoints(String),
         output_current_group(Stream, LineWidth, Indents, OpenGroups,
             Docs0, Docs, !RemainingWidth, !RemainingLines, !IO)
       else if Doc = hard_nl then
@@ -545,7 +546,8 @@ expand_docs(Canonicalize, FMap, [Doc | Docs0], Docs, OpenGroups,
       else
         (
             Doc = str(String),
-            !:RemainingWidth = !.RemainingWidth - string.length(String),
+            !:RemainingWidth = !.RemainingWidth -
+                string.count_codepoints(String),
             Docs = [Doc | Docs1],
             expand_docs(Canonicalize, FMap, Docs0, Docs1, OpenGroups,
                 !Limit, !Pri, !RemainingWidth)
@@ -638,7 +640,7 @@ output_indentation(_Stream, [], !RemainingWidth, !IO).
 output_indentation(Stream, [Indent | Indents], !RemainingWidth, !IO) :-
     output_indentation(Stream, Indents, !RemainingWidth, !IO),
     stream.put(Stream, Indent, !IO),
-    !:RemainingWidth = !.RemainingWidth - string.length(Indent).
+    !:RemainingWidth = !.RemainingWidth - string.count_codepoints(Indent).
 
 %-----------------------------------------------------------------------------%
 
diff --git a/library/string.m b/library/string.m
index 0806723..770a450 100644
--- a/library/string.m
+++ b/library/string.m
@@ -94,6 +94,11 @@
 :- func string.count_codepoints(string) = int.
 :- pred string.count_codepoints(string::in, int::out) is det.
 
+    % Determine the number of code units required to represent a string
+    % in UTF-8 encoding.
+    %
+:- func string.count_utf8_code_units(string) = int.
+
     % string.codepoint_offset(String, CodePointCount, CodePointOffset):
     % Equivalent to `string.codepoint_offset(String, 0, CodePointCount,
     % CodePointOffset)'.
@@ -605,7 +610,7 @@
     % The length (in code units) of the maximal suffix of `String' consisting
     % entirely of characters (code points) satisfying Pred.
     %
-:- func suffix_length(pred(char)::in(pred(in) is semidet), string::in)
+:- func string.suffix_length(pred(char)::in(pred(in) is semidet), string::in)
     = (int::out) is det.
 
     % string.set_char(Char, Index, String0, String):
@@ -1101,7 +1106,7 @@ string.to_int(String, Int) :-
 
 string.base_string_to_int(Base, String, Int) :-
     string.index(String, 0, Char),
-    Len = string.length(String),
+    Len = string.count_codepoints(String),
     ( Char = ('-') ->
         Len > 1,
         foldl_substring(accumulate_negative_int(Base), String, 1,
@@ -1614,19 +1619,7 @@ string.to_char_list(Str::uo, CharList::in) :-
 ").
 
 string.to_char_list_2(Str, CharList) :-
-    string.to_char_list_3(Str, string.length(Str) - 1, [], CharList).
-
-:- pred string.to_char_list_3(string::in, int::in,
-    list(char)::di, list(char)::uo) is det.
-
-string.to_char_list_3(Str, Index, CharList0, CharList) :-
-    ( Index >= 0 ->
-        string.unsafe_index(Str, Index, Char),
-        CharList1 = [Char | CharList0],
-        string.to_char_list_3(Str, Index - 1, CharList1, CharList)
-    ;
-        CharList = CharList0
-    ).
+    string.foldr(list.cons, Str, [], CharList).
 
 %-----------------------------------------------------------------------------%
 
@@ -3258,7 +3251,7 @@ format_int(Flags, Width, Prec, Int) = String :-
         AbsInteger = integer.abs(Integer),
         AbsIntStr = integer.to_string(AbsInteger)
     ),
-    AbsIntStrLength = string.length(AbsIntStr),
+    AbsIntStrLength = string.count_codepoints(AbsIntStr),
 
     % Do we need to increase precision?
     (
@@ -3273,7 +3266,7 @@ format_int(Flags, Width, Prec, Int) = String :-
     % Do we need to pad to the field width?
     (
         Width = yes(FieldWidth),
-        FieldWidth > string.length(PrecStr),
+        FieldWidth > string.count_codepoints(PrecStr),
         member('0', Flags),
         \+ member('-', Flags),
         Prec = no
@@ -3324,7 +3317,7 @@ format_unsigned_int(Flags, Width, Prec, Base, Int, IsTypeP, Prefix) = String :-
             AbsIntStr = AbsIntStr0
         )
     ),
-    AbsIntStrLength = string.length(AbsIntStr),
+    AbsIntStrLength = string.count_codepoints(AbsIntStr),
 
     % Do we need to increase precision?
     (
@@ -3350,7 +3343,7 @@ format_unsigned_int(Flags, Width, Prec, Base, Int, IsTypeP, Prefix) = String :-
     % Do we need to pad to the field width?
     (
         Width = yes(FieldWidth),
-        FieldWidth > string.length(PrecModStr),
+        FieldWidth > string.count_codepoints(PrecModStr),
         member('0', Flags),
         \+ member('-', Flags),
         Prec = no
@@ -3412,7 +3405,7 @@ format_float(Flags, Width, Prec, Float) = NewFloat :-
             \+ member('#', Flags),
             Prec = yes(0)
         ->
-            PrecStrLen = string.length(PrecStr),
+            PrecStrLen = string.count_codepoints(PrecStr),
             PrecModStr = string.substring(PrecStr, 0, PrecStrLen - 1)
         ;
             PrecModStr = PrecStr
@@ -3422,7 +3415,7 @@ format_float(Flags, Width, Prec, Float) = NewFloat :-
     % Do we need to change field width?
     (
         Width = yes(FieldWidth),
-        FieldWidth > string.length(PrecModStr),
+        FieldWidth > string.count_codepoints(PrecModStr),
         member('0', Flags),
         \+ member('-', Flags)
     ->
@@ -3469,7 +3462,7 @@ format_scientific_number_g(Flags, Width, Prec, Float, E) = NewFloat :-
         %
     (
         Width = yes(FieldWidth),
-        FieldWidth > string.length(PrecStr),
+        FieldWidth > string.count_codepoints(PrecStr),
         member('0', Flags),
         \+ member('-', Flags)
     ->
@@ -3521,7 +3514,7 @@ format_scientific_number(Flags, Width, Prec, Float, E) = NewFloat :-
     % Do we need to change field width?
     (
         Width = yes(FieldWidth),
-        FieldWidth > string.length(PrecModStr),
+        FieldWidth > string.count_codepoints(PrecModStr),
         member('0', Flags),
         \+ member('-', Flags)
     ->
@@ -3802,7 +3795,7 @@ change_to_e_notation(Float, Prec, E) = ScientificFloat :-
 
     % Is mantissa greater than one digit long?
     split_at_decimal_point(UnsafeBase, MantissaStr, _FractionStr),
-    ( string.length(MantissaStr) > 1 ->
+    ( string.count_codepoints(MantissaStr) > 1 ->
         % Need to append 0, to fix the problem of having no numbers
         % after the decimal point.
         SafeBase = calculate_base_unsafe(string.append(UnsafeBase, "0"),
@@ -3842,7 +3835,7 @@ size_of_required_exponent(Float, Prec) = Exponent :-
 
     % Is mantissa one digit long?
     split_at_decimal_point(UnsafeBase, MantissaStr, _FractionStr),
-    ( string.length(MantissaStr) > 1 ->
+    ( string.count_codepoints(MantissaStr) > 1 ->
         % We will need to move decimal pt one place to the left:
         % therefore, increment exponent.
         Exponent = UnsafeExponent + 1
@@ -3883,7 +3876,7 @@ remove_zeros(CharNum) = TrimmedNum :-
 
 decimal_pos(Float) = Pos :-
     split_at_decimal_point(Float, MantissaStr, _FractionStr),
-    NumZeros = string.length(MantissaStr) - 1,
+    NumZeros = string.count_codepoints(MantissaStr) - 1,
     Pos = find_non_zero_pos(string.to_char_list(Float), NumZeros).
 
     % Given a list of chars representing a floating point number, this
@@ -3951,7 +3944,7 @@ calculate_base_unsafe(Float, Prec) = Exp :-
 
 change_precision(Prec, OldFloat) = NewFloat :-
     split_at_decimal_point(OldFloat, MantissaStr, FractionStr),
-    FracStrLen = string.length(FractionStr),
+    FracStrLen = string.count_codepoints(FractionStr),
     ( Prec > FracStrLen ->
         PrecFracStr = string.pad_right(FractionStr, '0', Prec),
         PrecMantissaStr = MantissaStr
@@ -3965,7 +3958,10 @@ change_precision(Prec, OldFloat) = NewFloat :-
             NewPrecFrac = string.det_to_int(UnroundedFrac) + 1,
             NewPrecFracStrNotOK = string.int_to_string( NewPrecFrac),
             NewPrecFracStr = string.pad_left(NewPrecFracStrNotOK, '0', Prec),
-            ( string.length(NewPrecFracStr) > string.length(UnroundedFrac) ->
+            (
+                string.count_codepoints(NewPrecFracStr) >
+                    string.count_codepoints(UnroundedFrac)
+            ->
                 PrecFracStr = substring(NewPrecFracStr, 1, Prec),
                 PrecMantissaInt = det_to_int(MantissaStr) + 1,
                 PrecMantissaStr = int_to_string(PrecMantissaInt)
@@ -4995,6 +4991,38 @@ count_codepoints_2(String, I, Count0, Count) :-
     Count = String.codePointCount(0, String.length());
 ").
 
+/*-----------------------------------------------------------------------*/
+
+:- pragma foreign_proc("C",
+    string.count_utf8_code_units(Str::in) = (Length::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Length = strlen(Str);
+").
+:- pragma foreign_proc("Erlang",
+    string.count_utf8_code_units(Str::in) = (Length::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Length = size(Str)
+").
+
+string.count_utf8_code_units(String) = Length :-
+    string.foldl(count_utf8_code_units_2, String, 0, Length).
+
+:- pred count_utf8_code_units_2(char::in, int::in, int::out) is det.
+
+count_utf8_code_units_2(Char, !Length) :-
+    char.to_int(Char, CharInt),
+    ( CharInt =< 0x7f ->
+        !:Length = !.Length + 1
+    ; char.to_utf8(Char, UTF8) ->
+        !:Length = !.Length + list.length(UTF8)
+    ;
+        error("string.count_utf8_code_units: char.to_utf8 failed")
+    ).
+
+/*-----------------------------------------------------------------------*/
+
     % Note: we do not define what happens with unpaired surrogates.
     %
 string.codepoint_offset(String, N, Index) :-
diff --git a/library/term_io.m b/library/term_io.m
index 5407d56..cbf64ef 100644
--- a/library/term_io.m
+++ b/library/term_io.m
@@ -652,7 +652,7 @@ should_atom_be_quoted(S, NextToGraphicToken) = ShouldQuote :-
         ;
             % Graphic token (6.4.2)
             string.all_match(lexer.graphic_token_char, S),
-            string.length(S) > 0,
+            S \= "",
 
             % We need to quote tokens starting with '#', because Mercury uses
             % '#' to start source line number indicators.
diff --git a/tests/general/test_parsing_utils.exp b/tests/general/test_parsing_utils.exp
index 6f78c66..977e8be 100644
--- a/tests/general/test_parsing_utils.exp
+++ b/tests/general/test_parsing_utils.exp
@@ -28,6 +28,21 @@ pass: keyword("ABC", "ABC") on "ABCA"
 pass: keyword("ABC", "ABC") on "ABC 123"
 	returned unit as expected
 	[4 chars consumed]
+pass: keyword("αβγ", "αβγ") on ""
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "123"
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "αβγα"
+	failed as expected
+pass: keyword("αβγ", "αβγ") on "αβγ 123"
+	returned unit as expected
+	[4 code points consumed]
+pass: keyword("ABC", "ABC") on "abc 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: ikeyword("αβγ", "αβγ") on "αβγ 123"
+	returned unit as expected
+	[4 code points consumed]
 pass: identifier("ABC", "ABCabc_") on ""
 	failed as expected
 pass: identifier("ABC", "ABCabc_") on "abc"
@@ -45,6 +60,18 @@ pass: identifier("ABC", "ABCabc_") on "*"
 pass: identifier("ABC", "ABCabc_") on "Abc !"
 	returned "Abc" as expected
 	[4 chars consumed]
+pass: identifier("αβγ", "αβγ_") on ""
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "abc"
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "_"
+	failed as expected
+pass: identifier("αβγ", "αβγ_") on "α"
+	returned "α" as expected
+	[1 code points consumed]
+pass: identifier("αβγ", "αβγ_") on "αβ_γ"
+	returned "αβ_γ" as expected
+	[4 code points consumed]
 pass: whitespace on ""
 	returned unit as expected
 	[0 chars consumed]
@@ -159,6 +186,14 @@ pass: string_literal('\'') on "'123'   abc"
 pass: string_literal('\'') on "'1\'2\'3'   abc"
 	returned "1\\\'2\\\'3" as expected
 	[12 chars consumed]
+pass: string_literal('‖') on ""
+	failed as expected
+pass: string_literal('‖') on "‖123‖   abc"
+	returned "123" as expected
+	[8 code points consumed]
+pass: string_literal('‖') on "‖αβγ‖   abc"
+	returned "αβγ" as expected
+	[8 code points consumed]
 pass: optional(punct("!")) on ""
 	returned no as expected
 	[0 chars consumed]
diff --git a/tests/general/test_parsing_utils.m b/tests/general/test_parsing_utils.m
index 24e53af..2ca65f5 100644
--- a/tests/general/test_parsing_utils.m
+++ b/tests/general/test_parsing_utils.m
@@ -97,15 +97,25 @@ run_test(Result) :-
             PassFail = "fail"
         )
     ),
+    (
+        CurrentOffset = yes(CO),
+        input_substring(Src, 0, CO, Substring)
+    ->
+        NumCodePoints = string.count_codepoints(Substring),
+        NumCodeUnits = string.count_code_units(Substring),
+        ( NumCodeUnits = NumCodePoints ->
+            What = "chars"
+        ;
+            What = "code points"
+        ),
+        Consumed = string.format("\n\t[%d %s consumed]",
+            [i(NumCodePoints), s(What)])
+    ;
+        Consumed = ""
+    ),
     Result = PassFail ++ ": " ++
         ParserName ++ " on \"" ++ TestString ++ "\"\n\t" ++
-        Outcome ++
-        ( if CurrentOffset = yes(CO) then
-            string.format("\n\t[%d chars consumed]", [i(CO)])
-          else
-            ""
-        ) ++
-        "\n".
+        Outcome ++ Consumed ++ "\n".
 
 %-----------------------------------------------------------------------------%
 
@@ -146,6 +156,20 @@ test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
 test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
     "ABC 123", yes("unit")).
 
+test_case("keyword(\"αβγ\", \"αβγ\")", stringify(keyword("αβγ", "αβγ")),
+    "", no).
+test_case("keyword(\"αβγ\", \"αβγ\")", stringify(keyword("αβγ", "αβγ")),
+    "123", no).
+test_case("keyword(\"αβγ\", \"αβγ\")", stringify(keyword("αβγ", "αβγ")),
+    "αβγα", no).
+test_case("keyword(\"αβγ\", \"αβγ\")", stringify(keyword("αβγ", "αβγ")),
+    "αβγ 123", yes("unit")).
+
+test_case("keyword(\"ABC\", \"ABC\")", stringify(ikeyword("ABC", "ABC")),
+    "abc 123", yes("unit")).
+test_case("ikeyword(\"αβγ\", \"αβγ\")", stringify(ikeyword("αβγ", "αβγ")),
+    "αβγ 123", yes("unit")).
+
 test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
     "", no).
 test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
@@ -161,6 +185,17 @@ test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCab
 test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
     "Abc !", yes("\"Abc\"")).
 
+test_case("identifier(\"αβγ\", \"αβγ_\")", stringify(identifier("αβγ", "αβγ_")),
+    "", no).
+test_case("identifier(\"αβγ\", \"αβγ_\")", stringify(identifier("αβγ", "αβγ_")),
+    "abc", no).
+test_case("identifier(\"αβγ\", \"αβγ_\")", stringify(identifier("αβγ", "αβγ_")),
+    "_", no).
+test_case("identifier(\"αβγ\", \"αβγ_\")", stringify(identifier("αβγ", "αβγ_")),
+    "α", yes("\"α\"")).
+test_case("identifier(\"αβγ\", \"αβγ_\")", stringify(identifier("αβγ", "αβγ_")),
+    "αβ_γ", yes("\"αβ_γ\"")).
+
 test_case("whitespace", stringify(whitespace),
     "", yes("unit")).
 test_case("whitespace", stringify(whitespace),
@@ -255,6 +290,13 @@ test_case("string_literal('\\\'')", stringify(string_literal('\'')),
 test_case("string_literal('\\\'')", stringify(string_literal('\'')),
     "\'1\\\'2\\\'3\'   abc", yes("\"1\\\\\\\'2\\\\\\\'3\"")).
 
+test_case("string_literal('‖')", stringify(string_literal('‖')),
+    "", no).
+test_case("string_literal('‖')", stringify(string_literal('‖')),
+    "‖123‖   abc", yes("\"123\"")).
+test_case("string_literal('‖')", stringify(string_literal('‖')),
+    "‖αβγ‖   abc", yes("\"αβγ\"")).
+
 test_case("optional(punct(\"!\"))", stringify(optional(punct("!"))),
     "", yes("no")).
 test_case("optional(punct(\"!\"))", stringify(optional(punct("!"))),
diff --git a/tests/hard_coded/test_pretty_printer.exp b/tests/hard_coded/test_pretty_printer.exp
index 1dd2230..5a8d764 100644
--- a/tests/hard_coded/test_pretty_printer.exp
+++ b/tests/hard_coded/test_pretty_printer.exp
@@ -525,7 +525,7 @@ limit = triangular(10), max lines = 3, line width = 38
 
 limit = triangular(10), max lines = 3, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = triangular(10), max lines = 3, line width = 38
@@ -600,7 +600,7 @@ limit = triangular(10), max lines = 3, line width = 78
 
 limit = triangular(10), max lines = 3, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = triangular(10), max lines = 3, line width = 78
@@ -674,7 +674,7 @@ limit = triangular(10), max lines = 10, line width = 38
 
 limit = triangular(10), max lines = 10, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = triangular(10), max lines = 10, line width = 38
@@ -765,7 +765,7 @@ limit = triangular(10), max lines = 10, line width = 78
 
 limit = triangular(10), max lines = 10, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = triangular(10), max lines = 10, line width = 78
@@ -834,7 +834,7 @@ limit = triangular(100), max lines = 3, line width = 38
 
 limit = triangular(100), max lines = 3, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = triangular(100), max lines = 3, line width = 38
@@ -919,7 +919,7 @@ limit = triangular(100), max lines = 3, line width = 78
 
 limit = triangular(100), max lines = 3, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = triangular(100), max lines = 3, line width = 78
@@ -1009,7 +1009,7 @@ limit = triangular(100), max lines = 10, line width = 38
 
 limit = triangular(100), max lines = 10, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = triangular(100), max lines = 10, line width = 38
@@ -1138,7 +1138,7 @@ limit = triangular(100), max lines = 10, line width = 78
 
 limit = triangular(100), max lines = 10, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = triangular(100), max lines = 10, line width = 78
@@ -1235,7 +1235,7 @@ limit = linear(10), max lines = 3, line width = 38
 
 limit = linear(10), max lines = 3, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = linear(10), max lines = 3, line width = 38
@@ -1304,7 +1304,7 @@ limit = linear(10), max lines = 3, line width = 78
 
 limit = linear(10), max lines = 3, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = linear(10), max lines = 3, line width = 78
@@ -1368,7 +1368,7 @@ limit = linear(10), max lines = 10, line width = 38
 
 limit = linear(10), max lines = 10, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = linear(10), max lines = 10, line width = 38
@@ -1446,7 +1446,7 @@ limit = linear(10), max lines = 10, line width = 78
 
 limit = linear(10), max lines = 10, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = linear(10), max lines = 10, line width = 78
@@ -1510,7 +1510,7 @@ limit = linear(100), max lines = 3, line width = 38
 
 limit = linear(100), max lines = 3, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = linear(100), max lines = 3, line width = 38
@@ -1595,7 +1595,7 @@ limit = linear(100), max lines = 3, line width = 78
 
 limit = linear(100), max lines = 3, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = linear(100), max lines = 3, line width = 78
@@ -1685,7 +1685,7 @@ limit = linear(100), max lines = 10, line width = 38
 
 limit = linear(100), max lines = 10, line width = 38
 |------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |------------------------------------|
 
 limit = linear(100), max lines = 10, line width = 38
@@ -1814,7 +1814,7 @@ limit = linear(100), max lines = 10, line width = 78
 
 limit = linear(100), max lines = 10, line width = 78
 |----------------------------------------------------------------------------|
-{1, 2.0, "three", '4', {5}}
+{1, 2.0, "three", '4', {5}, "«ąąąąą»"}
 |----------------------------------------------------------------------------|
 
 limit = linear(100), max lines = 10, line width = 78
diff --git a/tests/hard_coded/test_pretty_printer.m b/tests/hard_coded/test_pretty_printer.m
index bd08378..2867ae3 100644
--- a/tests/hard_coded/test_pretty_printer.m
+++ b/tests/hard_coded/test_pretty_printer.m
@@ -214,7 +214,7 @@ test_case(test_case(LineWidth, MaxLines, Limit, Doc)) :-
         List, map.init : map(int, float)),
     OpTree = mk_op_tree(200),
     Church = list.foldl(func(_, X) = succ(X), 1..10, zero),
-    Tuple = {1, 2.0, "three", '4', {5}},
+    Tuple = {1, 2.0, "three", '4', {5}, "«ąąąąą»"},
     Square = list.duplicate(10, 1..10) : list(list(int)),
     IndentTest = docs([
         str("indentation test:"),

--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list