[m-rev.] for review: Reduce memory allocation in string.to_upper, string.to_lower.
Peter Wang
novalazy at gmail.com
Mon Jun 20 14:13:13 AEST 2016
library/string.m:
Implement to_upper(in, uo) and to_lower(in, uo) with foreign
code, not creating intermediate character lists.
Implement to_upper(in, in) and to_lower(in, in) modes without
allocating memory.
Be more specific in documentation about which characters are
affected by some functions/predicates.
---
NEWS | 2 +
library/string.m | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 166 insertions(+), 6 deletions(-)
diff --git a/NEWS b/NEWS
index 5147773..99bb129 100644
--- a/NEWS
+++ b/NEWS
@@ -213,6 +213,8 @@ Changes to the Mercury standard library:
* string.base_digit_to_int/3 and string.det_base_digit_to_int/2 now check
for overflow and underflow in all bases, not only base 10.
+* We have reduced the memory allocated by string.to_lower and string.to_upper.
+
* The following classification predicates have been added to the float module:
- is_finite/1
diff --git a/library/string.m b/library/string.m
index 55ef8b3..c13f7d4 100644
--- a/library/string.m
+++ b/library/string.m
@@ -768,19 +768,19 @@
%
% Convert the first character (if any) of a string to uppercase.
- % Note that this only converts unaccented Latin letters.
+ % Note that this only converts letters (a-z) in the ASCII range.
%
:- func capitalize_first(string) = string.
:- pred capitalize_first(string::in, string::out) is det.
% Convert the first character (if any) of a string to lowercase.
- % Note that this only converts unaccented Latin letters.
+ % Note that this only converts letters (A-Z) in the ASCII range.
%
:- func uncapitalize_first(string) = string.
:- pred uncapitalize_first(string::in, string::out) is det.
% Converts a string to uppercase.
- % Note that this only converts unaccented Latin letters.
+ % Note that this only converts letters (a-z) in the ASCII range.
%
:- func to_upper(string::in) = (string::uo) is det.
:- pred to_upper(string, string).
@@ -788,7 +788,7 @@
:- mode to_upper(in, in) is semidet. % implied
% Converts a string to lowercase.
- % Note that this only converts unaccented Latin letters.
+ % Note that this only converts letters (A-Z) in the ASCII range.
%
:- func to_lower(string::in) = (string::uo) is det.
:- pred to_lower(string, string).
@@ -4574,10 +4574,56 @@ uncapitalize_first(S0, S) :-
S = S0
).
+%---------------------%
+
to_upper(S1) = S2 :-
to_upper(S1, S2).
-to_upper(StrIn, StrOut) :-
+:- pragma promise_pure(to_upper/2).
+
+:- pragma foreign_proc("C",
+ to_upper(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ MR_Integer i;
+
+ MR_make_aligned_string_copy_msg(StrOut, StrIn, MR_ALLOC_ID);
+
+ for (i = 0; StrOut[i] != '\\0'; i++) {
+ if (StrOut[i] >= 'a' && StrOut[i] <= 'z') {
+ StrOut[i] = StrOut[i] - 'a' + 'A';
+ }
+ }
+").
+:- pragma foreign_proc("C#",
+ to_upper(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ char[] cs = StrIn.ToCharArray();
+ for (int i = 0; i < cs.Length; i++) {
+ if (cs[i] >= 'a' && cs[i] <= 'z') {
+ cs[i] = (char)(cs[i] - 'a' + 'A');
+ }
+ }
+ StrOut = new System.String(cs);
+").
+:- pragma foreign_proc("Java",
+ to_upper(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ char[] cs = StrIn.toCharArray();
+ for (int i = 0; i < cs.length; i++) {
+ if (cs[i] >= 'a' && cs[i] <= 'z') {
+ cs[i] = (char)(cs[i] - 'a' + 'A');
+ }
+ }
+ StrOut = new String(cs);
+").
+
+to_upper(StrIn::in, StrOut::uo) :-
to_char_list(StrIn, List),
char_list_to_upper(List, ListUpp),
from_char_list(ListUpp, StrOut).
@@ -4589,10 +4635,89 @@ char_list_to_upper([X | Xs], [Y | Ys]) :-
char.to_upper(X, Y),
char_list_to_upper(Xs, Ys).
+to_upper(X::in, Y::in) :-
+ length(X, LenX),
+ length(Y, LenY),
+ ( if LenX = LenY then
+ check_upper_loop(X, Y, 0, LenX)
+ else
+ fail
+ ).
+
+:- pred check_upper_loop(string::in, string::in, int::in, int::in) is semidet.
+
+check_upper_loop(X, Y, Index, End) :-
+ ( if Index = End then
+ true
+ else
+ unsafe_index_code_unit(X, Index, CodeX),
+ unsafe_index_code_unit(Y, Index, CodeY),
+ to_upper_code_unit(CodeX, CodeY),
+ check_upper_loop(X, Y, Index + 1, End)
+ ).
+
+:- pred to_upper_code_unit(int::in, int::out) is det.
+
+to_upper_code_unit(Code0, Code) :-
+ ( if
+ Code0 >= to_int('a'),
+ Code0 =< to_int('z')
+ then
+ Code = Code0 - to_int('a') + to_int('A')
+ else
+ Code = Code0
+ ).
+
+%---------------------%
+
to_lower(S1) = S2 :-
to_lower(S1, S2).
-to_lower(StrIn, StrOut) :-
+:- pragma promise_pure(to_lower/2).
+
+:- pragma foreign_proc("C",
+ to_lower(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ MR_Integer i;
+
+ MR_make_aligned_string_copy_msg(StrOut, StrIn, MR_ALLOC_ID);
+
+ for (i = 0; StrOut[i] != '\\0'; i++) {
+ if (StrOut[i] >= 'A' && StrOut[i] <= 'Z') {
+ StrOut[i] = StrOut[i] - 'A' + 'a';
+ }
+ }
+").
+:- pragma foreign_proc("C#",
+ to_lower(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ char[] cs = StrIn.ToCharArray();
+ for (int i = 0; i < cs.Length; i++) {
+ if (cs[i] >= 'A' && cs[i] <= 'Z') {
+ cs[i] = (char)(cs[i] - 'A' + 'a');
+ }
+ }
+ StrOut = new System.String(cs);
+").
+:- pragma foreign_proc("Java",
+ to_lower(StrIn::in, StrOut::uo),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness, no_sharing],
+"
+ char[] cs = StrIn.toCharArray();
+ for (int i = 0; i < cs.length; i++) {
+ if (cs[i] >= 'A' && cs[i] <= 'Z') {
+ cs[i] = (char)(cs[i] - 'A' + 'a');
+ }
+ }
+ StrOut = new String(cs);
+").
+
+to_lower(StrIn::in, StrOut::uo) :-
to_char_list(StrIn, List),
char_list_to_lower(List, ListLow),
from_char_list(ListLow, StrOut).
@@ -4604,6 +4729,39 @@ char_list_to_lower([X | Xs], [Y | Ys]) :-
char.to_lower(X, Y),
char_list_to_lower(Xs, Ys).
+to_lower(X::in, Y::in) :-
+ length(X, LenX),
+ length(Y, LenY),
+ ( if LenX = LenY then
+ check_lower_loop(X, Y, 0, LenX)
+ else
+ fail
+ ).
+
+:- pred check_lower_loop(string::in, string::in, int::in, int::in) is semidet.
+
+check_lower_loop(X, Y, Index, End) :-
+ ( if Index = End then
+ true
+ else
+ unsafe_index_code_unit(X, Index, CodeX),
+ unsafe_index_code_unit(Y, Index, CodeY),
+ to_lower_code_unit(CodeX, CodeY),
+ check_lower_loop(X, Y, Index + 1, End)
+ ).
+
+:- pred to_lower_code_unit(int::in, int::out) is det.
+
+to_lower_code_unit(Code0, Code) :-
+ ( if
+ Code0 >= to_int('A'),
+ Code0 =< to_int('Z')
+ then
+ Code = Code0 - to_int('A') + to_int('a')
+ else
+ Code = Code0
+ ).
+
%---------------------%
pad_left(S1, C, N) = S2 :-
--
2.6.4
More information about the reviews
mailing list