[m-rev.] for review: utf-8 improvements
Peter Wang
novalazy at gmail.com
Mon Mar 26 13:31:50 AEDT 2012
If necessary I'll submit just the bug fixes separately for 11.07.
---
Branches: main, 11.07
Optimise some UTF-8 routines in C grades and fix a few bugs.
library/string.m:
Avoid function calls in unsafe_index, unsafe_index_next, and
unsafe_prev_index in the ASCII case.
Handle an illegal code unit at the start of the string in the
first_char(in, uo, in) and first_char(in, uo, uo) modes.
runtime/mercury_string.c:
runtime/mercury_string.h:
Fix a bug where MR_utf8_next would not advance from pos 0, because it
compared `*pos' (rather than `s[*pos]') against the NUL terminator.
Fortunately MR_utf8_next is only rarely called, to skip past illegal
code units.
Delete redundant initial test in MR_utf8_prev.
Add MR_utf8_get_mb to extract multibyte code points only.
Unroll the trail byte decoding loop in MR_utf8_get_mb.
Add MR_utf8_get_next_mb to extract multibyte code points only.
Make MR_utf8_prev_get avoid an extra function call in the ASCII case.
Use MR_Integer consistently for string offsets instead of int.
diff --git a/library/string.m b/library/string.m
index 951596f..9636c7c 100644
--- a/library/string.m
+++ b/library/string.m
@@ -1651,7 +1651,7 @@ string.to_char_list(Str::uo, CharList::in) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- int pos = strlen(Str);
+ MR_Integer pos = strlen(Str);
int c;
CharList = MR_list_empty_msg(MR_ALLOC_ID);
@@ -4533,7 +4533,11 @@ string.unsafe_index(Str, Index, Char) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- Ch = MR_utf8_get(Str, Index);
+ Ch = Str[Index];
+ if (!MR_is_ascii(Ch)) {
+ int width;
+ Ch = MR_utf8_get_mb(Str, Index, &width);
+ }
SUCCESS_INDICATOR = (Ch > 0);
").
:- pragma foreign_proc("C#",
@@ -4580,10 +4584,15 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- int pos = Index;
- Ch = MR_utf8_get_next(Str, &pos);
- NextIndex = pos;
- SUCCESS_INDICATOR = (Ch > 0);
+ Ch = Str[Index];
+ if (MR_is_ascii(Ch)) {
+ NextIndex = Index + 1;
+ SUCCESS_INDICATOR = (Ch != 0);
+ } else {
+ NextIndex = Index;
+ Ch = MR_utf8_get_next_mb(Str, &NextIndex);
+ SUCCESS_INDICATOR = (Ch > 0);
+ }
").
:- pragma foreign_proc("C#",
@@ -4659,10 +4668,18 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- int pos = Index;
- Ch = MR_utf8_prev_get(Str, &pos);
- PrevIndex = pos;
- SUCCESS_INDICATOR = (Ch > 0);
+ if (Index > 0) {
+ PrevIndex = Index - 1;
+ Ch = Str[PrevIndex];
+ if (MR_is_ascii(Ch)) {
+ SUCCESS_INDICATOR = (Ch != 0);
+ } else {
+ Ch = MR_utf8_prev_get(Str, &PrevIndex);
+ SUCCESS_INDICATOR = (Ch > 0);
+ }
+ } else {
+ SUCCESS_INDICATOR = MR_FALSE;
+ }
").
:- pragma foreign_proc("C#",
@@ -5756,7 +5773,7 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- int pos = 0;
+ MR_Integer pos = 0;
int c = MR_utf8_get_next(Str, &pos);
SUCCESS_INDICATOR = (
c == First &&
@@ -5813,9 +5830,9 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
- int pos = 0;
+ MR_Integer pos = 0;
First = MR_utf8_get_next(Str, &pos);
- SUCCESS_INDICATOR = (First != '\\0' && strcmp(Str + pos, Rest) == 0);
+ SUCCESS_INDICATOR = (First > 0 && strcmp(Str + pos, Rest) == 0);
").
:- pragma foreign_proc("C#",
string.first_char(Str::in, First::uo, Rest::in),
@@ -5875,7 +5892,7 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- int pos = 0;
+ MR_Integer pos = 0;
int c = MR_utf8_get_next(Str, &pos);
if (c != First || First == '\\0') {
SUCCESS_INDICATOR = MR_FALSE;
@@ -5943,9 +5960,9 @@ string.split_by_codepoint(Str, Count, Left, Right) :-
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"{
- int pos = 0;
+ MR_Integer pos = 0;
First = MR_utf8_get_next(Str, &pos);
- if (First == '\\0') {
+ if (First < 1) {
SUCCESS_INDICATOR = MR_FALSE;
} else {
Str += pos;
diff --git a/runtime/mercury_string.c b/runtime/mercury_string.c
index 25fdf79..bd0f6a6 100644
--- a/runtime/mercury_string.c
+++ b/runtime/mercury_string.c
@@ -126,20 +126,18 @@ MR_hash_string3(MR_ConstString s)
}
MR_bool
-MR_utf8_next(const MR_String s_, int *pos)
+MR_utf8_next(const MR_String s_, MR_Integer *pos)
{
const unsigned char *s = (const unsigned char *)s_;
int c;
- if (*pos == '\0') {
+ if (s[*pos] == '\0') {
+ /* End of string. */
return MR_FALSE;
}
for (;;) {
++(*pos);
- if (*pos == '\0') {
- break;
- }
c = s[*pos];
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
break;
@@ -150,41 +148,48 @@ MR_utf8_next(const MR_String s_, int *pos)
}
MR_bool
-MR_utf8_prev(const MR_String s_, int *pos)
+MR_utf8_prev(const MR_String s_, MR_Integer *pos)
{
const unsigned char *s = (const unsigned char *)s_;
int c;
- if (*pos <= 0) {
- return MR_FALSE;
- }
-
while (*pos > 0) {
(*pos)--;
c = s[*pos];
if (MR_utf8_is_single_byte(c) || MR_utf8_is_lead_byte(c)) {
- break;
+ return MR_TRUE;
}
}
- return MR_TRUE;
+ return MR_FALSE;
}
MR_int_least32_t
-MR_utf8_get(const MR_String s_, int pos)
+MR_utf8_get(const MR_String s_, MR_Integer pos)
{
const unsigned char *s = (const unsigned char *)s_;
int c;
- int remain;
- int minc;
- int i;
+ int width;
c = s[pos];
-
- if (c <= 0x7F) {
- /* Plain ASCII (including NUL terminator). */
+ if (MR_is_ascii(c)) {
return c;
+ } else {
+ return MR_utf8_get_mb(s_, pos, &width);
}
+}
+
+MR_int_least32_t
+MR_utf8_get_mb(const MR_String s_, MR_Integer pos, int *width)
+{
+ const unsigned char *s = (const unsigned char *)s_;
+ int c;
+ int d;
+ int minc;
+
+ c = s[pos];
+
+ /* c <= 0x7f (ASCII) must be handled before calling this function. */
if (c <= 0xC1) {
/* Trailing byte of multi-byte sequence or an overlong encoding for
@@ -196,19 +201,19 @@ MR_utf8_get(const MR_String s_, int pos)
if (c <= 0xDF) {
/* 2-byte sequence. */
c &= 0x1F;
- remain = 1;
+ *width = 2;
minc = 0x80;
}
else if (c <= 0xEF) {
/* 3-byte sequence. */
c &= 0x0F;
- remain = 2;
+ *width = 3;
minc = 0x800;
}
else if (c <= 0xF4) {
/* 4-byte sequence. */
c &= 0x07;
- remain = 3;
+ *width = 4;
minc = 0x10000;
}
else {
@@ -216,24 +221,32 @@ MR_utf8_get(const MR_String s_, int pos)
return -2;
}
- for (i = 1; i <= remain; i++) {
- if (s[pos + i] == '\0') {
- return -2;
- }
- }
-
- while (remain--) {
- int d = s[++pos];
-
- if (!MR_utf8_is_trail_byte(d)) {
- return -2;
- }
-
- c = (c << 6) | (d & 0x3F);
+ switch (*width) {
+ case 4:
+ d = s[++pos];
+ if (!MR_utf8_is_trail_byte(d)) {
+ return -2;
+ }
+ c = (c << 6) | (d & 0x3F);
+ /* fall through */
+ case 3:
+ d = s[++pos];
+ if (!MR_utf8_is_trail_byte(d)) {
+ return -2;
+ }
+ c = (c << 6) | (d & 0x3F);
+ /* fall through */
+ case 2:
+ d = s[++pos];
+ if (!MR_utf8_is_trail_byte(d)) {
+ return -2;
+ }
+ c = (c << 6) | (d & 0x3F);
+ break;
}
/* Check for overlong forms, which could be used to bypass security
- * validations. We could also check code points aren't above U+10FFFF or in
+ * validations. We could also check code points aren't above U+10FFFF or in
* the surrogate ranges, but we don't.
*/
@@ -245,12 +258,28 @@ MR_utf8_get(const MR_String s_, int pos)
}
MR_int_least32_t
-MR_utf8_get_next(const MR_String s, int *pos)
+MR_utf8_get_next(const MR_String s, MR_Integer *pos)
+{
+ int c;
+
+ c = s[*pos];
+ if (MR_is_ascii(c)) {
+ (*pos)++;
+ return c;
+ }
+
+ return MR_utf8_get_next_mb(s, pos);
+}
+
+MR_int_least32_t
+MR_utf8_get_next_mb(const MR_String s, MR_Integer *pos)
{
- int c = MR_utf8_get(s, *pos);
+ int c, width;
+ c = MR_utf8_get_mb(s, *pos, &width);
if (c >= 0) {
- (*pos) += MR_utf8_width(c);
+ /* Multibyte code point. */
+ (*pos) += width;
return c;
}
@@ -260,10 +289,17 @@ MR_utf8_get_next(const MR_String s, int *pos)
}
MR_int_least32_t
-MR_utf8_prev_get(const MR_String s, int *pos)
+MR_utf8_prev_get(const MR_String s, MR_Integer *pos)
{
+ int c, width;
+
if (MR_utf8_prev(s, pos)) {
- return MR_utf8_get(s, *pos);
+ c = s[*pos];
+ if (MR_is_ascii(c)) {
+ return c;
+ } else {
+ return MR_utf8_get_mb(s, *pos, &width);
+ }
}
/* Past beginning. */
@@ -336,7 +372,7 @@ MR_utf8_encode(char s_[], MR_Char c)
MR_bool
MR_utf8_verify(const MR_String s)
{
- int pos = 0;
+ MR_Integer pos = 0;
for (;;) {
MR_int_least32_t c;
diff --git a/runtime/mercury_string.h b/runtime/mercury_string.h
index bf84900..963b415 100644
--- a/runtime/mercury_string.h
+++ b/runtime/mercury_string.h
@@ -341,33 +341,42 @@ MR_String MR_make_string(MR_AllocSiteInfoPtr alloc_id, const char *fmt, ...);
** If `*pos' is already at the end of the string then return MR_FALSE
** without modifying `*pos'.
*/
-extern MR_bool MR_utf8_next(const MR_String s_, int *pos);
+extern MR_bool MR_utf8_next(const MR_String s_, MR_Integer *pos);
/*
** Rewind `*pos' to the beginning of the previous code point in `s'.
** If `*pos' is already at the beginning of the string then return MR_FALSE
** without modifying `*pos'.
*/
-extern MR_bool MR_utf8_prev(const MR_String s_, int *pos);
+extern MR_bool MR_utf8_prev(const MR_String s_, MR_Integer *pos);
/*
** Decode and return the code point beginning at `pos' in `s'.
** Return 0 if at the end of the string (i.e. the NUL terminator).
** If an illegal code sequence exists at that offset, return -2.
+**
+** The _mb version requires s[pos] to be the lead byte of a multibyte code
+** point.
*/
-extern MR_int_least32_t MR_utf8_get(const MR_String s, int pos);
+extern MR_int_least32_t MR_utf8_get(const MR_String s, MR_Integer pos);
+extern MR_int_least32_t MR_utf8_get_mb(const MR_String s, MR_Integer pos,
+ int *width);
/*
** Decode the code point beginning at `pos' in `s', and advance `*pos'.
+** The _mb version requires s[pos] to be the lead byte of a multibyte code
+** point.
*/
-extern MR_int_least32_t MR_utf8_get_next(const MR_String s, int *pos);
+extern MR_int_least32_t MR_utf8_get_next(const MR_String s, MR_Integer *pos);
+extern MR_int_least32_t MR_utf8_get_next_mb(const MR_String s,
+ MR_Integer *pos);
/*
** Rewind `*pos' to the beginning of the previous code point in `s'
** and return that code code.
** Return -1 if `*pos' is already at the beginning of the string.
*/
-extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, int *pos);
+extern MR_int_least32_t MR_utf8_prev_get(const MR_String s, MR_Integer *pos);
/*
** Return the number of bytes required to encode the code point `c' in UTF-8.
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to: mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions: mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the reviews mailing list