[m-rev.] for review: Define behaviour of string.set_char etc on ill-formed sequences.

Peter Wang novalazy at gmail.com
Mon Nov 11 15:55:08 AEDT 2019


library/string.m:
    Define behaviour of set_char, det_set_char and unsafe_set_char on
    ill-formed sequences. Also define them to throw an exception on an
    attempt to set a null character or surrogate code point in a UTF-8
    string.

    Delete claim that unsafe_set_char is constant time. That would only
    be true for the destructive mode of unsafe_set_char, and that mode
    has been disabled for a long time.

    Implement the defined behaviour for C and C# versions of
    unsafe_set_char. The Java version already behaved as defined.

    Use unsafe_set_char to implement set_char instead of duplicating
    foreign code.

    Replace a couple of uses of strcpy with MR_memcpy as it was
    convenient to do so. (On OpenBSD, the linker issues a warning
    whenever strcpy is used. Avoiding the warning is not high priority
    but we might still like to eliminate all uses of strcpy eventually.)
---
 library/string.m | 246 ++++++++++++++++++-----------------------------
 1 file changed, 91 insertions(+), 155 deletions(-)

diff --git a/library/string.m b/library/string.m
index c0041f99e..5391a6095 100644
--- a/library/string.m
+++ b/library/string.m
@@ -375,11 +375,19 @@
 
     % set_char(Char, Index, String0, String):
     %
-    % `String' is `String0', with the code point beginning at code unit
-    % `Index' removed and replaced by `Char'.
+    % `String' is `String0', with the code unit sequence beginning at `Index'
+    % replaced by the encoding of `Char'. If the code unit at `Index' is the
+    % initial code unit in a valid encoding of a code point, then that entire
+    % code unit sequence is replaced. Otherwise, only the code unit at `Index'
+    % is replaced.
+    %
     % Fails if `Index' is out of range (negative, or greater than or equal to
     % the length of `String0').
     %
+    % Throws an exception if `Char' is the null character or a code point that
+    % cannot be encoded in a string (namely, surrogate code points cannot be
+    % encoded in UTF-8 strings).
+    %
 :- pred set_char(char, int, string, string).
 :- mode set_char(in, in, in, out) is semidet.
 % NOTE This mode is disabled because the compiler puts constant strings
@@ -388,10 +396,8 @@
 
     % det_set_char(Char, Index, String0, String):
     %
-    % `String' is `String0', with the code point beginning at code unit
-    % `Index' removed and replaced by `Char'.
-    % Calls error/1 if `Index' is out of range (negative, or greater than
-    % or equal to the length of `String0').
+    % Same as set_char/4 but throws an exception if `Index' is out of range
+    % (negative, or greater than or equal to the length of `String0').
     %
 :- func det_set_char(char, int, string) = string.
 :- pred det_set_char(char, int, string, string).
@@ -402,12 +408,10 @@
 
     % unsafe_set_char(Char, Index, String0, String):
     %
-    % `String' is `String0', with the code point beginning at code unit
-    % `Index' removed and replaced by `Char'.
+    % Same as set_char/4 but does not check if `Index' is in range.
     % WARNING: behavior is UNDEFINED if `Index' is out of range
     % (negative, or greater than or equal to the length of `String0').
-    % This version is constant time, whereas det_set_char
-    % may be linear in the length of the string. Use with care!
+    % Use with care!
     %
 :- func unsafe_set_char(char, int, string) = string.
 :- mode unsafe_set_char(in, in, in) = out is det.
@@ -2689,119 +2693,23 @@ index_check(Index, Length) :-
 % Writing characters to strings.
 %
 
-% XXX ILSEQ If Index does not point to the start of a well-formed sequence,
-% consider making set_char/unsafe_set_char replace the code unit at Index with
-% the encoding of Char, i.e. treat the code unit as its own pseudo code point.
-
-set_char(Char, Index, !Str) :-
+set_char(Char, Index, Str0, Str) :-
     ( if char.to_int(Char, 0) then
         unexpected($pred, "null character")
+    else if
+        internal_encoding_is_utf8,
+        char.is_surrogate(Char)
+    then
+        unexpected($pred, "surrogate code point")
     else
-        set_char_non_null(Char, Index, !Str)
+        Len0 = length(Str0),
+        ( if index_check(Index, Len0) then
+            unsafe_set_char_copy_string(Char, Index, Len0, Str0, Str)
+        else
+            fail
+        )
     ).
 
-:- pred set_char_non_null(char, int, string, string).
-:- mode set_char_non_null(in, in, in, out) is semidet.
-% NOTE This mode is disabled because the compiler puts constant strings
-% into static data even when they might be updated.
-% :- mode set_char_non_null(in, in, di, uo) is semidet.
-
-:- pragma foreign_proc("C",
-    set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness],
-"
-    size_t len = strlen(Str0);
-    if ((MR_Unsigned) Index >= len) {
-        SUCCESS_INDICATOR = MR_FALSE;
-    } else if (MR_is_ascii(Str0[Index]) && MR_is_ascii(Ch)) {
-        // Fast path.
-        SUCCESS_INDICATOR = MR_TRUE;
-        MR_allocate_aligned_string_msg(Str, len, MR_ALLOC_ID);
-        strcpy(Str, Str0);
-        Str[Index] = Ch;
-    } else {
-        int oldc = MR_utf8_get(Str0, Index);
-        if (oldc < 0) {
-            SUCCESS_INDICATOR = MR_FALSE;
-        } else {
-            size_t oldwidth = MR_utf8_width(oldc);
-            size_t newwidth = MR_utf8_width(Ch);
-            size_t newlen;
-
-            newlen = len - oldwidth + newwidth;
-            MR_allocate_aligned_string_msg(Str, newlen, MR_ALLOC_ID);
-            MR_memcpy(Str, Str0, Index);
-            MR_utf8_encode(Str + Index, Ch);
-            strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
-            SUCCESS_INDICATOR = MR_TRUE;
-        }
-    }
-").
-:- pragma foreign_proc("C#",
-    set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    if (Index < 0 || Index >= Str0.Length) {
-        Str = null;
-        SUCCESS_INDICATOR = false;
-    } else {
-        System.Text.StringBuilder sb = new System.Text.StringBuilder(Str0);
-        if (!System.Char.IsHighSurrogate(Str0, Index) && Ch <= 0xffff) {
-            // Fast path.
-            sb[Index] = (char) Ch;
-        } else {
-            if (Str0.Length > Index + 1 &&
-                System.Char.IsLowSurrogate(Str0, Index + 1))
-            {
-                sb.Remove(Index, 2);
-            } else {
-                sb.Remove(Index, 1);
-            }
-            sb.Insert(Index, System.Char.ConvertFromUtf32(Ch));
-        }
-        Str = sb.ToString();
-        SUCCESS_INDICATOR = true;
-    }
-").
-:- pragma foreign_proc("Java",
-    set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    if (Index < 0 || Index >= Str0.length()) {
-        Str = null;
-        SUCCESS_INDICATOR = false;
-    } else {
-        java.lang.StringBuilder sb = new StringBuilder(Str0);
-
-        int oldc = sb.codePointAt(Index);
-        int oldwidth = java.lang.Character.charCount(oldc);
-        int newwidth = java.lang.Character.charCount(Ch);
-        if (oldwidth == 1 && newwidth == 1) {
-            sb.setCharAt(Index, (char) Ch);
-        } else {
-            char[] buf = java.lang.Character.toChars(Ch);
-            sb.replace(Index, Index + oldwidth, new String(buf));
-        }
-
-        Str = sb.toString();
-        SUCCESS_INDICATOR = true;
-    }
-").
-:- pragma foreign_proc("Erlang",
-    set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    case Str0 of
-        <<Left:Index/binary, _/utf8, Right/binary>> ->
-            Str = unicode:characters_to_binary([Left, Ch, Right]),
-            SUCCESS_INDICATOR = true;
-        _ ->
-            Str = <<>>,
-            SUCCESS_INDICATOR = false
-    end
-").
-
 det_set_char(C, N, S0) = S :-
     det_set_char(C, N, S0, S).
 
@@ -2817,65 +2725,91 @@ det_set_char(Char, Int, String0, String) :-
 unsafe_set_char(C, N, S0) = S :-
     unsafe_set_char(C, N, S0, S).
 
-unsafe_set_char(Char, Index, !Str) :-
+unsafe_set_char(Char, Index, Str0, Str) :-
     ( if char.to_int(Char, 0) then
         unexpected($pred, "null character")
+    else if
+        internal_encoding_is_utf8,
+        char.is_surrogate(Char)
+    then
+        unexpected($pred, "surrogate code point")
     else
-        unsafe_set_char_non_null(Char, Index, !Str)
+        Len0 = length(Str0),
+        unsafe_set_char_copy_string(Char, Index, Len0, Str0, Str)
     ).
 
-:- pred unsafe_set_char_non_null(char, int, string, string).
-:- mode unsafe_set_char_non_null(in, in, in, out) is det.
-% NOTE This mode is disabled because the compiler puts constant strings
-% into static data even when they might be updated.
-% :- mode unsafe_set_char_non_null(in, in, di, uo) is det.
+:- pred unsafe_set_char_copy_string(char, int, int, string, string).
+:- mode unsafe_set_char_copy_string(in, in, in, in, uo) is det.
 
 :- pragma foreign_proc("C",
-    unsafe_set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
+    unsafe_set_char_copy_string(Ch::in, Index::in, Len0::in,
+        Str0::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness],
 "
-    size_t len = strlen(Str0);
-    if (MR_is_ascii(Str0[Index]) && MR_is_ascii(Ch)) {
+    int b;
+    size_t oldlen;
+    size_t oldwidth;
+    size_t newwidth;
+    size_t newlen;
+
+    // The cast to (unsigned char *) is to prevent sign extension.
+    b = ((const unsigned char *) Str0)[Index];
+    if (MR_utf8_is_lead_byte(b)) {
+        MR_Integer next_index = Index;
+        int oldc = MR_utf8_get_next_mb(Str0, &next_index);
+        if (oldc < 0) {
+            oldwidth = 1;
+        } else {
+            oldwidth = next_index - Index;
+        }
+    } else {
+        oldwidth = 1;
+    }
+
+    if (MR_is_ascii(Ch)) {
+        // Fast path.
+        newwidth = 1;
+    } else {
+        newwidth = MR_utf8_width(Ch);
+    }
+
+    oldlen = Len0;
+    newlen = oldlen - oldwidth + newwidth;
+
+    MR_allocate_aligned_string_msg(Str, newlen, MR_ALLOC_ID);
+    MR_memcpy(Str, Str0, Index);
+    if (MR_is_ascii(Ch)) {
         // Fast path.
-        MR_allocate_aligned_string_msg(Str, len, MR_ALLOC_ID);
-        strcpy(Str, Str0);
         Str[Index] = Ch;
     } else {
-        // XXX ILSEQ Fail for surrogate code points.
-        int oldc = MR_utf8_get(Str0, Index);
-        size_t oldwidth = MR_utf8_width(oldc);
-        size_t newwidth = MR_utf8_width(Ch);
-        size_t newlen;
-        size_t tailofs;
-
-        newlen = len - oldwidth + newwidth;
-        MR_allocate_aligned_string_msg(Str, newlen, MR_ALLOC_ID);
-        MR_memcpy(Str, Str0, Index);
         MR_utf8_encode(Str + Index, Ch);
-        strcpy(Str + Index + newwidth, Str0 + Index + oldwidth);
     }
+    MR_memcpy(Str + Index + newwidth,
+        Str0 + Index + oldwidth,
+        oldlen - Index - oldwidth + 1);
 ").
 :- pragma foreign_proc("C#",
-    unsafe_set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
+    unsafe_set_char_copy_string(Ch::in, Index::in, _Len0::in,
+        Str0::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
-    if (System.Char.IsHighSurrogate(Str0, Index)) {
-        Str = System.String.Concat(
-            Str0.Substring(0, Index),
-            System.Char.ConvertFromUtf32(Ch),
-            Str0.Substring(Index + 2)
-        );
+    int oldwidth;
+    if (System.Char.IsHighSurrogate(Str0, Index)
+        && Index + 1 < Str0.Length
+        && System.Char.IsLowSurrogate(Str0, Index + 1))
+    {
+        oldwidth = 2;
     } else {
-        Str = System.String.Concat(
-            Str0.Substring(0, Index),
-            System.Char.ConvertFromUtf32(Ch),
-            Str0.Substring(Index + 1)
-        );
+        oldwidth = 1;
     }
+    Str = Str0.Substring(0, Index)
+        + System.Char.ConvertFromUtf32(Ch)
+        + Str0.Substring(Index + oldwidth);
 ").
 :- pragma foreign_proc("Java",
-    unsafe_set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
+    unsafe_set_char_copy_string(Ch::in, Index::in, _Len0::in,
+        Str0::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
     int oldc = Str0.codePointAt(Index);
@@ -2885,9 +2819,11 @@ unsafe_set_char(Char, Index, !Str) :-
         + Str0.subSequence(Index + oldwidth, Str0.length());
 ").
 :- pragma foreign_proc("Erlang",
-    unsafe_set_char_non_null(Ch::in, Index::in, Str0::in, Str::out),
+    unsafe_set_char_copy_string(Ch::in, Index::in, _Len0::in,
+        Str0::in, Str::uo),
     [will_not_call_mercury, promise_pure, thread_safe],
 "
+    % XXX does not implement defined behaviour
     <<Left:Index/binary, _/utf8, Right/binary>> = Str0,
     Str = unicode:characters_to_binary([Left, Ch, Right])
 ").
-- 
2.23.0



More information about the reviews mailing list