[m-rev.] for review: utf-8 improvements

Peter Wang novalazy at gmail.com
Mon Mar 26 13:31:50 AEDT 2012


If necessary I'll submit just the bug fixes separately for 11.07.
---

Branches: main, 11.07

Optimise some UTF-8 routines in C grades and fix a few bugs.

library/string.m:
	Avoid function calls in unsafe_index, unsafe_index_next, and
	unsafe_prev_index in the ASCII case.

	Handle an illegal code unit at the start of the string in the
	first_char(in, uo, in) and first_char(in, uo, uo) modes.

runtime/mercury_string.c:
runtime/mercury_string.h:
	Fix a bug where MR_utf8_next would not advance from pos 0, because it
	compared the offset itself, not the code unit at that offset, with NUL.
	Fortunately MR_utf8_next is only rarely called, to skip past illegal
	code units.

	Delete redundant initial test in MR_utf8_prev.

	Add MR_utf8_get_mb to extract multibyte code points only.
	Unroll a loop.

	Add MR_utf8_get_next_mb to extract multibyte code points only.

	Make MR_utf8_prev_get avoid an extra function call in the ASCII case.

	Use MR_Integer consistently for string offsets instead of int.

diff --git a/library/string.m b/library/string.m
index 951596f..9636c7c 100644
--- a/library/string.m
+++ b/library/string.m
@@ -1651,7 +1651,7 @@ string.to_char_list(Str::uo, CharList::in) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    int pos = strlen(Str);
+    MR_Integer pos = strlen(Str);
     int c;
 
     CharList = MR_list_empty_msg(MR_ALLOC_ID);
@@ -4533,7 +4533,11 @@ string.unsafe_index(Str, Index, Char) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    Ch = MR_utf8_get(Str, Index);
+    Ch = Str[Index];
+    if (!MR_is_ascii(Ch)) {
+        int width;
+        Ch = MR_utf8_get_mb(Str, Index, &width);
+    }
     SUCCESS_INDICATOR = (Ch > 0);
 ").
 :- pragma foreign_proc("C#",
@@ -4580,10 +4584,15 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    int pos = Index;
-    Ch = MR_utf8_get_next(Str, &pos);
-    NextIndex = pos;
-    SUCCESS_INDICATOR = (Ch > 0);
+    Ch = Str[Index];
+    if (MR_is_ascii(Ch)) {
+        NextIndex = Index + 1;
+        SUCCESS_INDICATOR = (Ch != 0);
+    } else {
+        NextIndex = Index;
+        Ch = MR_utf8_get_next_mb(Str, &NextIndex);
+        SUCCESS_INDICATOR = (Ch > 0);
+    }
 ").
 
 :- pragma foreign_proc("C#",
@@ -4659,10 +4668,18 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    int pos = Index;
-    Ch = MR_utf8_prev_get(Str, &pos);
-    PrevIndex = pos;
-    SUCCESS_INDICATOR = (Ch > 0);
+    if (Index > 0) {
+        PrevIndex = Index - 1;
+        Ch = Str[PrevIndex];
+        if (MR_is_ascii(Ch)) {
+            SUCCESS_INDICATOR = (Ch != 0);
+        } else {
+            Ch = MR_utf8_prev_get(Str, &PrevIndex);
+            SUCCESS_INDICATOR = (Ch > 0);
+        }
+    } else {
+        SUCCESS_INDICATOR = MR_FALSE;
+    }
 ").
 
 :- pragma foreign_proc("C#",
@@ -5756,7 +5773,7 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    int pos = 0;
+    MR_Integer pos = 0;
     int c = MR_utf8_get_next(Str, &pos);
     SUCCESS_INDICATOR = (
         c == First &&
@@ -5813,9 +5830,9 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "
-    int pos = 0;
+    MR_Integer pos = 0;
     First = MR_utf8_get_next(Str, &pos);
-    SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + pos, Rest) == 0);
+    SUCCESS_INDICATOR = (First > 0 && strcmp(Str + pos, Rest) == 0);
 ").
 :- pragma foreign_proc("C#",
     string.first_char(Str::in, First::uo, Rest::in),
@@ -5875,7 +5892,7 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    int pos = 0;
+    MR_Integer pos = 0;
     int c = MR_utf8_get_next(Str, &pos);
     if (c != First || First == '\\0') {
         SUCCESS_INDICATOR = MR_FALSE;
@@ -5943,9 +5960,9 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
     [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
         does_not_affect_liveness, no_sharing],
 "{
-    int pos = 0;
+    MR_Integer pos = 0;
     First = MR_utf8_get_next(Str, &pos);
-    if (First == '\\0') {
+    if (First < 1) {
         SUCCESS_INDICATOR = MR_FALSE;
     } else {
         Str += pos;
diff --git a/runtime/mercury_string.c b/runtime/mercury_string.c
index 25fdf79..bd0f6a6 100644
--- a/runtime/mercury_string.c
+++ b/runtime/mercury_string.c
@@ -126,20 +126,18 @@ MR_hash_string3(MR_ConstString s)
 }
 
 MR_bool
-MR_utf8_next(const MR_String s_, int *pos)
+MR_utf8_next(const MR_String s_, MR_Integer *pos)
 {
     const unsigned char *s = (const unsigned char *)s_;
     int c;
 
-    if (*pos == '\0') {
+    if (s[*pos] == '\0') {
+        /* End of string. */
         return MR_FALSE;
     }
 
     for (;;) {
         ++(*pos);
-        if (*pos == '\0') {
-            break;
-        }
         c = s[*pos];
         if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
             break;
@@ -150,41 +148,48 @@ MR_utf8_next(const MR_String s_, int *pos)
 }
 
 MR_bool
-MR_utf8_prev(const MR_String s_, int *pos)
+MR_utf8_prev(const MR_String s_, MR_Integer *pos)
 {
     const unsigned char *s = (const unsigned char *)s_;
     int c;
 
-    if (*pos <= 0) {
-        return MR_FALSE;
-    }
-
     while (*pos > 0) {
         (*pos)--;
         c = s[*pos];
         if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
-            break;
+            return MR_TRUE;
         }
     }
 
-    return MR_TRUE;
+    return MR_FALSE;
 }
 
 MR_int_least32_t
-MR_utf8_get(const MR_String s_, int pos)
+MR_utf8_get(const MR_String s_, MR_Integer pos)
 {
     const unsigned char *s = (const unsigned char *)s_;
     int c;
-    int remain;
-    int minc;
-    int i;
+    int width;
 
     c = s[pos];
-
-    if (c <= 0x7F) {
-        /* Plain ASCII (including NUL terminator). */
+    if (MR_is_ascii(c)) {
         return c;
+    } else {
+        return MR_utf8_get_mb(s_, pos, &width);
     }
+}
+
+MR_int_least32_t
+MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
+{
+    const unsigned char *s = (const unsigned char *)s_;
+    int c;
+    int d;
+    int minc;
+
+    c = s[pos];
+
+    /* c <= 0x7f (ASCII) must be handled before calling this function. */
 
     if (c <= 0xC1) {
         /* Trailing byte of multi-byte sequence or an overlong encoding for
@@ -196,19 +201,19 @@ MR_utf8_get(const MR_String s_, int pos)
     if (c <= 0xDF) {
         /* 2-byte sequence. */
         c &= 0x1F;
-        remain = 1;
+        *width = 2;
         minc = 0x80;
     }
     else if (c <= 0xEF) {
         /* 3-byte sequence. */
         c &= 0x0F;
-        remain = 2;
+        *width = 3;
         minc = 0x800;
     }
     else if (c <= 0xF4) {
         /* 4-byte sequence. */
         c &= 0x07;
-        remain = 3;
+        *width = 4;
         minc = 0x10000;
     }
     else {
@@ -216,24 +221,32 @@ MR_utf8_get(const MR_String s_, int pos)
         return -2;
     }
 
-    for (i = 1; i <= remain; i++) {
-        if (s[pos + i] == '\0') {
-            return -2;
-        }
-    }
-
-    while (remain--) {
-        int d = s[++pos];
-
-        if (!MR_utf8_is_trail_byte(d)) {
-            return -2;
-        }
-
-        c = (c << 6) | (d & 0x3F);
+    switch (*width) {
+        case 4:
+            d = s[++pos];
+            if (!MR_utf8_is_trail_byte(d)) {
+                return -2;
+            }
+            c = (c << 6) | (d & 0x3F);
+            /* fall through */
+        case 3:
+            d = s[++pos];
+            if (!MR_utf8_is_trail_byte(d)) {
+                return -2;
+            }
+            c = (c << 6) | (d & 0x3F);
+            /* fall through */
+        case 2:
+            d = s[++pos];
+            if (!MR_utf8_is_trail_byte(d)) {
+                return -2;
+            }
+            c = (c << 6) | (d & 0x3F);
+            break;
     }
 
     /* Check for overlong forms, which could be used to bypass security
-     * validations.  We could also check code points aren't above U+10FFFF or in
+     * validations. We could also check code points aren't above U+10FFFF or in
      * the surrogate ranges, but we don't.
      */
 
@@ -245,12 +258,28 @@ MR_utf8_get(const MR_String s_, int pos)
 }
 
 MR_int_least32_t
-MR_utf8_get_next(const MR_String s, int *pos)
+MR_utf8_get_next(const MR_String s, MR_Integer *pos)
+{
+    int c;
+
+    c = s[*pos];
+    if (MR_is_ascii(c)) {
+        (*pos)++;
+        return c;
+    }
+
+    return MR_utf8_get_next_mb(s, pos);
+}
+
+MR_int_least32_t
+MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
 {
-    int c = MR_utf8_get(s, *pos);
+    int c, width;
 
+    c = MR_utf8_get_mb(s, *pos, &width);
     if (c >= 0) {
-        (*pos) += MR_utf8_width(c);
+        /* Multibyte code point. */
+        (*pos) += width;
         return c;
     }
 
@@ -260,10 +289,17 @@ MR_utf8_get_next(const MR_String s, int *pos)
 }
 
 MR_int_least32_t
-MR_utf8_prev_get(const MR_String s, int *pos)
+MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
 {
+    int c, width;
+
     if (MR_utf8_prev(s, pos)) {
-        return MR_utf8_get(s, *pos);
+        c = s[*pos];
+        if (MR_is_ascii(c)) {
+            return c;
+        } else {
+            return MR_utf8_get_mb(s, *pos, &width);
+        }
     }
 
     /* Past beginning. */
@@ -336,7 +372,7 @@ MR_utf8_encode(char s_[], MR_Char c)
 MR_bool
 MR_utf8_verify(const MR_String s)
 {
-    int pos = 0;
+    MR_Integer pos = 0;
 
     for (;;) {
         MR_int_least32_t c;
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index bf84900..963b415 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -341,33 +341,42 @@ MR_String MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...);
 ** If `*pos' is already at the end of the string then return MR_FALSE
 ** without modifying `*pos'.
 */
-extern MR_bool  MR_utf8_next(const MR_String s_, int *pos);
+extern MR_bool  MR_utf8_next(const MR_String s_, MR_Integer *pos);
 
 /*
 ** Rewind `*pos' to the beginning of the previous code point in `s'.
 ** If `*pos' is already at the beginning of the string then return MR_FALSE
 ** without modifying `*pos'.
 */
-extern MR_bool  MR_utf8_prev(const MR_String s_, int *pos);
+extern MR_bool  MR_utf8_prev(const MR_String s_, MR_Integer *pos);
 
 /*
 ** Decode and return the code point beginning at `pos' in `s'.
 ** Return 0 if at the end of the string (i.e. the NUL terminator).
 ** If an illegal code sequence exists at that offset, return -2.
+**
+** The _mb version requires s[pos] to be the lead byte of a multibyte code
+** point.
 */
-extern MR_int_least32_t MR_utf8_get(const MR_String s, int pos);
+extern MR_int_least32_t MR_utf8_get(const MR_String s, MR_Integer pos);
+extern MR_int_least32_t MR_utf8_get_mb(const MR_String s, MR_Integer pos,
+    int *width);
 
 /*
 ** Decode the code point beginning at `pos' in `s', and advance `*pos'.
+** The _mb version requires s[*pos] to be the lead byte of a multibyte code
+** point.
 */
-extern MR_int_least32_t MR_utf8_get_next(const MR_String s, int *pos);
+extern MR_int_least32_t MR_utf8_get_next(const MR_String s, MR_Integer *pos);
+extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
+    MR_Integer *pos);
 
 /*
 ** Rewind `*pos' to the beginning of the previous code point in `s'
 ** and return that code code.
 ** Return -1 if `*pos' is already at the beginning of the string.
 */
-extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, int *pos);
+extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);
 
 /*
 ** Return the number of bytes required to encode the code point `c' in UTF-8.

--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list