[m-rev.] for review: Reduce memory allocation in string.to_upper, string.to_lower.

Peter Wang novalazy at gmail.com
Mon Jun 20 14:13:13 AEST 2016


library/string.m:
	Implement to_upper(in, uo) and to_lower(in, uo) with foreign
	code, not creating intermediate character lists.

	Implement to_upper(in, in) and to_lower(in, in) modes without
	allocating memory.

	Be more specific in documentation about which characters are
	affected by some functions/predicates.
---
 NEWS             |   2 +
 library/string.m | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 166 insertions(+), 6 deletions(-)

diff --git a/NEWS b/NEWS
index 5147773..99bb129 100644
--- a/NEWS
+++ b/NEWS
@@ -213,6 +213,8 @@ Changes to the Mercury standard library:
 * string.base_digit_to_int/3 and string.det_base_digit_to_int/2 now check
   for overflow and underflow in all bases, not only base 10.
 
+* We have reduced the memory allocated by string.to_lower and string.to_upper.
+
 * The following classification predicates have been added to the float module:
 
    - is_finite/1
diff --git a/library/string.m b/library/string.m
index 55ef8b3..c13f7d4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -768,19 +768,19 @@
 %
 
     % Convert the first character (if any) of a string to uppercase.
-    % Note that this only converts unaccented Latin letters.
+    % Note that this only converts letters (a-z) in the ASCII range.
     %
 :- func capitalize_first(string) = string.
 :- pred capitalize_first(string::in, string::out) is det.
 
     % Convert the first character (if any) of a string to lowercase.
-    % Note that this only converts unaccented Latin letters.
+    % Note that this only converts letters (A-Z) in the ASCII range.
     %
 :- func uncapitalize_first(string) = string.
 :- pred uncapitalize_first(string::in, string::out) is det.
 
     % Converts a string to uppercase.
-    % Note that this only converts unaccented Latin letters.
+    % Note that this only converts letters (a-z) in the ASCII range.
     %
 :- func to_upper(string::in) = (string::uo) is det.
 :- pred to_upper(string, string).
@@ -788,7 +788,7 @@
 :- mode to_upper(in, in) is semidet.        % implied
 
     % Converts a string to lowercase.
-    % Note that this only converts unaccented Latin letters.
+    % Note that this only converts letters (A-Z) in the ASCII range.
     %
 :- func to_lower(string::in) = (string::uo) is det.
 :- pred to_lower(string, string).
@@ -4574,10 +4574,56 @@ uncapitalize_first(S0, S) :-
         S = S0
     ).
 
+%---------------------%
+
 to_upper(S1) = S2 :-
     to_upper(S1, S2).
 
-to_upper(StrIn, StrOut) :-
+:- pragma promise_pure(to_upper/2).
+
+:- pragma foreign_proc("C",
+    to_upper(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    MR_Integer  i;
+
+    MR_make_aligned_string_copy_msg(StrOut, StrIn, MR_ALLOC_ID);
+
+    for (i = 0; StrOut[i] != '\\0'; i++) {
+        if (StrOut[i] >= 'a' && StrOut[i] <= 'z') {
+            StrOut[i] = StrOut[i] - 'a' + 'A';
+        }
+    }
+").
+:- pragma foreign_proc("C#",
+    to_upper(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    char[] cs = StrIn.ToCharArray();
+    for (int i = 0; i < cs.Length; i++) {
+        if (cs[i] >= 'a' && cs[i] <= 'z') {
+            cs[i] = (char)(cs[i] - 'a' + 'A');
+        }
+    }
+    StrOut = new System.String(cs);
+").
+:- pragma foreign_proc("Java",
+    to_upper(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    char[] cs = StrIn.toCharArray();
+    for (int i = 0; i < cs.length; i++) {
+        if (cs[i] >= 'a' && cs[i] <= 'z') {
+            cs[i] = (char)(cs[i] - 'a' + 'A');
+        }
+    }
+    StrOut = new String(cs);
+").
+
+to_upper(StrIn::in, StrOut::uo) :-
     to_char_list(StrIn, List),
     char_list_to_upper(List, ListUpp),
     from_char_list(ListUpp, StrOut).
@@ -4589,10 +4635,89 @@ char_list_to_upper([X | Xs], [Y | Ys]) :-
     char.to_upper(X, Y),
     char_list_to_upper(Xs, Ys).
 
+to_upper(X::in, Y::in) :-
+    length(X, LenX),
+    length(Y, LenY),
+    ( if LenX = LenY then
+        check_upper_loop(X, Y, 0, LenX)
+    else
+        fail
+    ).
+
+:- pred check_upper_loop(string::in, string::in, int::in, int::in) is semidet.
+
+check_upper_loop(X, Y, Index, End) :-
+    ( if Index = End then
+        true
+    else
+        unsafe_index_code_unit(X, Index, CodeX),
+        unsafe_index_code_unit(Y, Index, CodeY),
+        to_upper_code_unit(CodeX, CodeY),
+        check_upper_loop(X, Y, Index + 1, End)
+    ).
+
+:- pred to_upper_code_unit(int::in, int::out) is det.
+
+to_upper_code_unit(Code0, Code) :-
+    ( if
+        Code0 >= to_int('a'),
+        Code0 =< to_int('z')
+    then
+        Code = Code0 - to_int('a') + to_int('A')
+    else
+        Code = Code0
+    ).
+
+%---------------------%
+
 to_lower(S1) = S2 :-
     to_lower(S1, S2).
 
-to_lower(StrIn, StrOut) :-
+:- pragma promise_pure(to_lower/2).
+
+:- pragma foreign_proc("C",
+    to_lower(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    MR_Integer  i;
+
+    MR_make_aligned_string_copy_msg(StrOut, StrIn, MR_ALLOC_ID);
+
+    for (i = 0; StrOut[i] != '\\0'; i++) {
+        if (StrOut[i] >= 'A' && StrOut[i] <= 'Z') {
+            StrOut[i] = StrOut[i] - 'A' + 'a';
+        }
+    }
+").
+:- pragma foreign_proc("C#",
+    to_lower(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    char[] cs = StrIn.ToCharArray();
+    for (int i = 0; i < cs.Length; i++) {
+        if (cs[i] >= 'A' && cs[i] <= 'Z') {
+            cs[i] = (char)(cs[i] - 'A' + 'a');
+        }
+    }
+    StrOut = new System.String(cs);
+").
+:- pragma foreign_proc("Java",
+    to_lower(StrIn::in, StrOut::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness, no_sharing],
+"
+    char[] cs = StrIn.toCharArray();
+    for (int i = 0; i < cs.length; i++) {
+        if (cs[i] >= 'A' && cs[i] <= 'Z') {
+            cs[i] = (char)(cs[i] - 'A' + 'a');
+        }
+    }
+    StrOut = new String(cs);
+").
+
+to_lower(StrIn::in, StrOut::uo) :-
     to_char_list(StrIn, List),
     char_list_to_lower(List, ListLow),
     from_char_list(ListLow, StrOut).
@@ -4604,6 +4729,39 @@ char_list_to_lower([X | Xs], [Y | Ys]) :-
     char.to_lower(X, Y),
     char_list_to_lower(Xs, Ys).
 
+to_lower(X::in, Y::in) :-
+    length(X, LenX),
+    length(Y, LenY),
+    ( if LenX = LenY then
+        check_lower_loop(X, Y, 0, LenX)
+    else
+        fail
+    ).
+
+:- pred check_lower_loop(string::in, string::in, int::in, int::in) is semidet.
+
+check_lower_loop(X, Y, Index, End) :-
+    ( if Index = End then
+        true
+    else
+        unsafe_index_code_unit(X, Index, CodeX),
+        unsafe_index_code_unit(Y, Index, CodeY),
+        to_lower_code_unit(CodeX, CodeY),
+        check_lower_loop(X, Y, Index + 1, End)
+    ).
+
+:- pred to_lower_code_unit(int::in, int::out) is det.
+
+to_lower_code_unit(Code0, Code) :-
+    ( if
+        Code0 >= to_int('A'),
+        Code0 =< to_int('Z')
+    then
+        Code = Code0 - to_int('A') + to_int('a')
+    else
+        Code = Code0
+    ).
+
 %---------------------%
 
 pad_left(S1, C, N) = S2 :-
-- 
2.6.4



More information about the reviews mailing list