[m-rev.] for review: Add predicates to find chars in strings.
Peter Wang
novalazy at gmail.com
Fri Jul 19 16:56:59 AEST 2024
Add the following predicates to find the first or last occurrence of a
code point in a string:
find_first_char
We already had the code to implement contains_char.
Not strictly necessary as we have sub_string_search.
find_first_char_start
Safe wrapper for unsafe_find_first_char_start.
unsafe_find_first_char_start
This is just the body of find_first_char, which should be useful for
users. Not strictly needed as we have sub_string_search_start.
find_last_char
Commonly needed.
NOTE: I also considered these predicates but discarded them for now:
:- pred find_first_char_between(string::in, char::in,
int::in, int::in, int::out) is semidet.
:- pred find_last_char_between(string::in, char::in,
int::in, int::in, int::out) is semidet.
:- pred find_first_match_between(pred(char)::in(pred(in) is semidet),
string::in, int::in, int::in, int::out) is semidet.
:- pred find_last_match_between(pred(char)::in(pred(in) is semidet),
string::in, int::in, int::in, int::out) is semidet.
The _between predicates required a bit more code than I'd like, for the
amount of use that they would (I imagine) get. The _match predicates
were just conveniences over iterating over a string manually.
All four predicates would incur calls to strlen() in C grades,
which suggests adding "unsafe" versions as well.
library/string.m:
Add the predicates above.
Implement string.contains_char using string.find_first_char.
tests/hard_coded/Mmakefile:
tests/hard_coded/string_find_char.exp:
tests/hard_coded/string_find_char.exp2:
tests/hard_coded/string_find_char.m:
Add test case.
NEWS.md:
Announce additions.
diff --git a/NEWS.md b/NEWS.md
index 7c4254300..6a9e4cbd3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -782,11 +782,14 @@ Changes to the Mercury standard library
- func `between_code_points/3`
- pred `between_code_points/4`
- pred `check_well_formedness/2`
- - pred `contains_match/2`
- pred `code_point_offset/3`
- pred `code_point_offset/4`
+ - pred `contains_match/2`
- func `count_code_points/1`
- pred `count_code_points/2`
+ - pred `find_first_char/3`
+ - pred `find_first_char_start/4`
+ - pred `find_last_char/3`
- func `internal_string_encoding/0`
- func `left_by_code_point/2`
- pred `left_by_code_point/3`
@@ -794,6 +797,7 @@ Changes to the Mercury standard library
- pred `right_by_code_point/3`
- pred `split_by_code_point/3`
- pred `split_by_code_point/4`
+ - pred `unsafe_find_first_char_start/4`
* The following obsolete modes have been removed from the following predicates:
diff --git a/library/string.m b/library/string.m
index 468fb960d..f55b08475 100644
--- a/library/string.m
+++ b/library/string.m
@@ -763,6 +763,42 @@
:- pred unsafe_sub_string_search_start(string::in, string::in, int::in,
int::out) is semidet.
+ % find_first_char(String, Char, Index):
+ %
+ % Find the first occurrence of the code point Char in String.
+ % On success, Index is the code unit offset of that code point.
+ %
+:- pred find_first_char(string::in, char::in, int::out) is semidet.
+
+ % find_first_char_start(String, Char, BeginAt, Index):
+ %
+ % Find the first occurrence of the code point Char in String,
+ % beginning from the code unit offset BeginAt in String.
+ % On success, Index is the code unit offset of that code point.
+ %
+ % Fails if BeginAt is out of range (negative, or greater than or equal
+ % to the length of String).
+ %
+:- pred find_first_char_start(string::in, char::in, int::in, int::out)
+ is semidet.
+
+ % unsafe_find_first_char_start(String, Char, BeginAt, Index):
+ %
+ % Same as find_first_char_start/4 but does not check that BeginAt
+ % is in range.
+ % WARNING: if BeginAt is either negative, or greater than length(String),
+ % then the behaviour is UNDEFINED. Use with care!
+ %
+:- pred unsafe_find_first_char_start(string::in, char::in, int::in, int::out)
+ is semidet.
+
+ % find_last_char(String, Char, Index):
+ %
+ % Find the last occurrence of the code point Char in String.
+ % On success, Index is the code unit offset of that code point.
+ %
+:- pred find_last_char(string::in, char::in, int::out) is semidet.
+
%---------------------------------------------------------------------------%
%
% Appending strings.
@@ -3548,58 +3584,8 @@ contains_match_loop(P, String, Cur) :-
%---------------------%
-:- pragma foreign_proc("C",
- contains_char(Str::in, Ch::in),
- [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
- does_not_affect_liveness, no_sharing],
-"
- char buf[5];
- size_t len;
- if (MR_is_ascii(Ch)) {
- // Fast path.
- // strchr always returns true when searching for NUL,
- // but the NUL is not part of the string itself.
- SUCCESS_INDICATOR = (Ch != '\\0') && (strchr(Str, Ch) != NULL);
- } else {
- len = MR_utf8_encode(buf, Ch);
- buf[len] = '\\0';
- SUCCESS_INDICATOR = (len > 0) && (strstr(Str, buf) != NULL);
- }
-").
-:- pragma foreign_proc("C#",
- contains_char(Str::in, Ch::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- if (Ch <= 0xffff) {
- SUCCESS_INDICATOR = (Str.IndexOf((char) Ch) != -1);
- } else {
- string s = System.Char.ConvertFromUtf32(Ch);
- SUCCESS_INDICATOR = Str.Contains(s);
- }
-").
-:- pragma foreign_proc("Java",
- contains_char(Str::in, Ch::in),
- [will_not_call_mercury, promise_pure, thread_safe],
-"
- // indexOf(int) handles supplementary characters correctly.
- SUCCESS_INDICATOR = (Str.indexOf((int) Ch) != -1);
-").
-
-contains_char(String, Char) :-
- contains_char_loop(String, Char, 0).
-
-:- pred contains_char_loop(string::in, char::in, int::in) is semidet.
-
-contains_char_loop(Str, Char, I) :-
- unsafe_index_next_repl(Str, I, J, IndexChar, MaybeReplaced),
- ( if
- MaybeReplaced = not_replaced,
- IndexChar = Char
- then
- true
- else
- contains_char_loop(Str, Char, J)
- ).
+contains_char(Str, Char) :-
+ find_first_char(Str, Char, _).
%---------------------%
@@ -3814,6 +3800,148 @@ unsafe_sub_string_search_start_loop(String, SubString, I, LastI,
SubLen, Index)
).
+%---------------------%
+
+find_first_char(Str, Char, Index) :-
+ unsafe_find_first_char_start(Str, Char, 0, Index).
+
+find_first_char_start(Str, Char, BeginAt, Index) :-
+ ( if
+ (
+ BeginAt = 0
+ ;
+ BeginAt > 0,
+ BeginAt < string.count_code_units(Str)
+ )
+ then
+ unsafe_find_first_char_start(Str, Char, BeginAt, Index)
+ else
+ fail
+ ).
+
+:- pragma foreign_proc("C",
+ unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ char *p = NULL;
+
+ if (MR_is_ascii(Ch)) {
+ // strchr will always find the null terminator, but the terminator
+ // is not part of the string.
+ if (Ch != '\\0') {
+ p = strchr(Str + BeginAt, Ch);
+ }
+ } else {
+ char buf[5];
+ size_t len;
+
+ len = MR_utf8_encode(buf, Ch);
+ if (len > 0) {
+ buf[len] = '\\0';
+ p = strstr(Str + BeginAt, buf);
+ }
+ }
+ if (p != NULL) {
+ SUCCESS_INDICATOR = MR_TRUE;
+ Index = (p - Str);
+ } else {
+ SUCCESS_INDICATOR = MR_FALSE;
+ Index = -1;
+ }
+").
+:- pragma foreign_proc("C#",
+ unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ if (Ch <= 0xffff) {
+ Index = Str.IndexOf((char) Ch, BeginAt);
+ } else {
+ string s = System.Char.ConvertFromUtf32(Ch);
+ Index = Str.IndexOf(s, BeginAt, System.StringComparison.Ordinal);
+ }
+ SUCCESS_INDICATOR = (Index >= 0);
+").
+:- pragma foreign_proc("Java",
+ unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Index = Str.indexOf(Ch, BeginAt);
+ SUCCESS_INDICATOR = (Index >= 0);
+").
+
+%---------------------%
+
+find_last_char(Str, Char, Index) :-
+ % For C grades, find_last_char_2 only works for a single code unit.
+ % i.e. an ASCII character.
+ ( if
+ string.internal_string_encoding = utf8,
+ not char.is_ascii(Char)
+ then
+ find_last_char_loop(Str, Char, 0, string.count_code_units(Str), Index)
+ else
+ find_last_char_2(Str, Char, Index)
+ ).
+
+ % NOTE: the C implementation expects an ASCII character only.
+ %
+:- pred find_last_char_2(string::in, char::in, int::out) is semidet.
+
+:- pragma foreign_proc("C",
+ find_last_char_2(Str::in, Ch::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ char *p = NULL;
+ // strchr will always find the null terminator, but the terminator
+ // is not part of the string.
+ if (Ch != '\\0') {
+ p = strrchr(Str, Ch);
+ }
+ if (p != NULL) {
+ SUCCESS_INDICATOR = MR_TRUE;
+ Index = (p - Str);
+ } else {
+ SUCCESS_INDICATOR = MR_FALSE;
+ Index = -1;
+ }
+").
+:- pragma foreign_proc("C#",
+ find_last_char_2(Str::in, Ch::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ if (Ch <= 0xffff) {
+ Index = Str.LastIndexOf((char) Ch);
+ } else {
+ string s = System.Char.ConvertFromUtf32(Ch);
+ Index = Str.LastIndexOf(s, System.StringComparison.Ordinal);
+ }
+ SUCCESS_INDICATOR = (Index >= 0);
+").
+:- pragma foreign_proc("Java",
+ find_last_char_2(Str::in, Ch::in, Index::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Index = Str.lastIndexOf(Ch);
+ SUCCESS_INDICATOR = (Index >= 0);
+").
+
+ % Pre-condition: LowerIndex =< Index0
+ %
+:- pred find_last_char_loop(string::in, char::in, int::in, int::in, int::out)
+ is semidet.
+
+find_last_char_loop(Str, MatchChar, LowerIndex, Index0, MatchIndex) :-
+ string.unsafe_prev_index_repl(Str, Index0, Index1, Char, MaybeReplaced),
+ LowerIndex =< Index1,
+ ( if
+ MaybeReplaced = not_replaced,
+ MatchChar = Char
+ then
+ MatchIndex = Index1
+ else
+ find_last_char_loop(Str, MatchChar, LowerIndex, Index1, MatchIndex)
+ ).
+
%---------------------------------------------------------------------------%
%
% Appending strings.
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index ee30efb57..81c5b40e5 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -403,6 +403,7 @@ ORDINARY_PROGS = \
string_contains_char \
string_contains_match \
string_count_codepoints_ilseq \
+ string_find_char \
string_first_char \
string_fold_ilseq \
string_from_char_list_ilseq \
diff --git a/tests/hard_coded/string_find_char.exp b/tests/hard_coded/string_find_char.exp
new file mode 100644
index 000000000..eb6b82659
--- /dev/null
+++ b/tests/hard_coded/string_find_char.exp
@@ -0,0 +1,11 @@
+* * 😀 A B 😀 | 😀 A B
+0 1 2 6 7 8 12 13 17 18
+
+find_first_char ==> index=6, char='A'
+find_first_char ==> index=2, char='😀'
+find_first_char failed
+
+find_last_char ==> index=17, char='A'
+find_last_char ==> index=13, char='😀'
+find_last_char failed
+
diff --git a/tests/hard_coded/string_find_char.exp2 b/tests/hard_coded/string_find_char.exp2
new file mode 100644
index 000000000..704c570b1
--- /dev/null
+++ b/tests/hard_coded/string_find_char.exp2
@@ -0,0 +1,11 @@
+* * 😀 A B 😀 | 😀 A B
+0 1 2 4 5 6 8 9 11 12
+
+find_first_char ==> index=4, char='A'
+find_first_char ==> index=2, char='😀'
+find_first_char failed
+
+find_last_char ==> index=11, char='A'
+find_last_char ==> index=9, char='😀'
+find_last_char failed
+
diff --git a/tests/hard_coded/string_find_char.m b/tests/hard_coded/string_find_char.m
new file mode 100644
index 000000000..00ca2509b
--- /dev/null
+++ b/tests/hard_coded/string_find_char.m
@@ -0,0 +1,137 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_find_char.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int.
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+ Str0 = "**",
+ Str1 = Str0 ++ "😀",
+ Str2 = Str1 ++ "AB😀",
+ Str = Str2 ++ "|😀AB",
+
+ string.count_code_units(Str0, Happy1),
+ string.count_code_units(Str1, A1),
+ string.count_code_units(Str2, Bar),
+ string.count_code_units(Str, End),
+
+ _Offsets = [
+ {-1, End}, % out of range
+ {0, End + 1}, % out of range
+ {A1, A1}, % zero length span
+ {0, End}, % whole string
+
+ {0, A1}, % contains 1st emoji
+ {0, A1 - 1}, % 1st emoji truncated, should not be found
+
+ {A1, Bar}, % contains 2nd emoji
+ {A1, Bar - 1}, % 2nd emoji truncated, should not be found
+
+ {Happy1, Bar}, % contains 1st A, two emoji
+ {Bar, End} % contains 2nd A, one emoji
+ ],
+
+ % Show the string being searched.
+ show_string(Str, !IO),
+ io.nl(!IO),
+
+ Ascii = 'A',
+ Happy = '😀',
+ NotPresent = '?',
+
+ % Find first char.
+ test_find_first_char(Str, Ascii, !IO),
+ test_find_first_char(Str, Happy, !IO),
+ test_find_first_char(Str, NotPresent, !IO),
+ io.nl(!IO),
+
+ % Find last char.
+ test_find_last_char(Str, Ascii, !IO),
+ test_find_last_char(Str, Happy, !IO),
+ test_find_last_char(Str, NotPresent, !IO),
+ io.nl(!IO),
+
+ /*
+ % Find first char between, single and multiple code units.
+ list.foldl(test_find_first_char_between(Ascii, Str), Offsets, !IO),
+ io.nl(!IO),
+ list.foldl(test_find_first_char_between(Happy, Str), Offsets, !IO),
+ io.nl(!IO),
+
+ % Find last char between, single and multiple code units.
+ list.foldl(test_find_last_char_between(Ascii, Str), Offsets, !IO),
+ io.nl(!IO),
+ list.foldl(test_find_last_char_between(Happy, Str), Offsets, !IO),
+ io.nl(!IO),
+ */
+ true.
+
+%---------------------------------------------------------------------------%
+
+:- pred show_string(string::in, io::di, io::uo) is det.
+
+show_string(Str, !IO) :-
+ string.to_char_list(Str, CharList),
+ list.foldl(write_char_padded, CharList, !IO),
+ io.nl(!IO),
+ list.foldl2(write_char_offset, CharList, 0, _Offset, !IO),
+ io.nl(!IO).
+
+:- pred write_char_padded(char::in, io::di, io::uo) is det.
+
+write_char_padded(Char, !IO) :-
+ ( if char.to_int(Char) >= 0x1F600 then
+ io.format("%-3c", [c(Char)], !IO)
+ else
+ io.format("%-4c", [c(Char)], !IO)
+ ).
+
+:- pred write_char_offset(char::in, int::in, int::out, io::di, io::uo) is det.
+
+write_char_offset(Char, Offset0, Offset, !IO) :-
+ io.format("%-4d", [i(Offset0)], !IO),
+ Str = string.from_char(Char),
+ Offset = Offset0 + string.count_code_units(Str).
+
+%---------------------------------------------------------------------------%
+
+:- pred test_find_first_char(string::in, char::in, io::di, io::uo) is det.
+
+test_find_first_char(Str, Char, !IO) :-
+ ( if find_first_char(Str, Char, Index) then
+ io.format("find_first_char ==> index=%d, char='%c'\n",
+ [i(Index), c(Char)], !IO)
+ else
+ io.write_string("find_first_char failed\n", !IO)
+ ).
+
+:- pred test_find_last_char(string::in, char::in, io::di, io::uo) is det.
+
+test_find_last_char(Str, Char, !IO) :-
+ ( if find_last_char(Str, Char, Index) then
+ io.format("find_last_char ==> index=%d, char='%c'\n",
+ [i(Index), c(Char)], !IO)
+ else
+ io.write_string("find_last_char failed\n", !IO)
+ ).
--
2.44.0
More information about the reviews
mailing list