[m-rev.] for review: Add predicates to find chars in strings.

Peter Wang novalazy at gmail.com
Fri Jul 19 16:56:59 AEST 2024


Add the following predicates to find the first or last occurrence of a
code point in a string:

find_first_char
    We already had the code to implement contains_char.
    Not strictly necessary as we have sub_string_search.

find_first_char_start
    Safe wrapper for unsafe_find_first_char_start.

unsafe_find_first_char_start
    This is just the body of find_first_char, which should be useful for
    users. Not strictly needed as we have sub_string_search_start.

find_last_char
    Commonly needed.

NOTE: I also considered these predicates but discarded them for now:

    :- pred find_first_char_between(string::in, char::in,
        int::in, int::in, int::out) is semidet.

    :- pred find_last_char_between(string::in, char::in,
        int::in, int::in, int::out) is semidet.

    :- pred find_first_match_between(pred(char)::in(pred(in) is semidet),
        string::in, int::in, int::in, int::out) is semidet.

    :- pred find_last_match_between(pred(char)::in(pred(in) is semidet),
        string::in, int::in, int::in, int::out) is semidet.

The _between predicates required a bit more code than I'd like, for the
amount of use that they would (I imagine) get. The _match predicates
were just conveniences over iterating over a string manually.
All four predicates would incur calls to strlen() in C grades,
which suggests adding "unsafe" versions as well.

library/string.m:
    Add the predicates above.

    Implement string.contains_char using string.find_first_char.

tests/hard_coded/Mmakefile:
tests/hard_coded/string_find_char.exp:
tests/hard_coded/string_find_char.exp2:
tests/hard_coded/string_find_char.m:
    Add test case.

NEWS.md:
    Announce additions.

diff --git a/NEWS.md b/NEWS.md
index 7c4254300..6a9e4cbd3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -782,11 +782,14 @@ Changes to the Mercury standard library
     - func `between_code_points/3`
     - pred `between_code_points/4`
     - pred `check_well_formedness/2`
-    - pred `contains_match/2`
     - pred `code_point_offset/3`
     - pred `code_point_offset/4`
+    - pred `contains_match/2`
     - func `count_code_points/1`
     - pred `count_code_points/2`
+    - pred `find_first_char/3`
+    - pred `find_first_char_start/4`
+    - pred `find_last_char/3`
     - func `internal_string_encoding/0`
     - func `left_by_code_point/2`
     - pred `left_by_code_point/3`
@@ -794,6 +797,7 @@ Changes to the Mercury standard library
     - pred `right_by_code_point/3`
     - pred `split_by_code_point/3`
     - pred `split_by_code_point/4`
+    - pred `unsafe_find_first_char_start/4`
 
 * The following obsolete modes have been removed from the following predicates:
 
diff --git a/library/string.m b/library/string.m
index 468fb960d..f55b08475 100644
--- a/library/string.m
+++ b/library/string.m
@@ -763,6 +763,42 @@
 :- pred unsafe_sub_string_search_start(string::in, string::in, int::in,
     int::out) is semidet.
 
+    % find_first_char(String, Char, Index):
+    %
+    % Find the first occurrence of the code point Char in String.
+    % On success, Index is the code unit offset of that code point.
+    %
+:- pred find_first_char(string::in, char::in, int::out) is semidet.
+
+    % find_first_char_start(String, Char, BeginAt, Index):
+    %
+    % Find the first occurrence of the code point Char in String,
+    % beginning from the code unit offset BeginAt in String.
+    % On success, Index is the code unit offset of that code point.
+    %
+    % Fails if BeginAt is out of range (negative, or greater than or equal
+    % to the length of String).
+    %
+:- pred find_first_char_start(string::in, char::in, int::in, int::out)
+    is semidet.
+
+    % unsafe_find_first_char_start(String, Char, BeginAt, Index):
+    %
+    % Same as find_first_char_start/4 but does not check that BeginAt
+    % is in range.
+    % WARNING: if BeginAt is either negative, or greater than length(String),
+    % then the behaviour is UNDEFINED. Use with care!
+    %
+:- pred unsafe_find_first_char_start(string::in, char::in, int::in, int::out)
+    is semidet.
+
+    % find_last_char(String, Char, Index):
+    %
+    % Find the last occurrence of the code point Char in String.
+    % On success, Index is the code unit offset of that code point.
+    %
+:- pred find_last_char(string::in, char::in, int::out) is semidet.
+
 %---------------------------------------------------------------------------%
 %
 % Appending strings.
@@ -3548,58 +3584,8 @@ contains_match_loop(P, String, Cur) :-
 
 %---------------------%
 
-:- pragma foreign_proc("C",
-    contains_char(Str::in, Ch::in),
-    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
-        does_not_affect_liveness, no_sharing],
-"
-    char    buf[5];
-    size_t  len;
-    if (MR_is_ascii(Ch)) {
-        // Fast path.
-        // strchr always returns true when searching for NUL,
-        // but the NUL is not part of the string itself.
-        SUCCESS_INDICATOR = (Ch != '\\0') && (strchr(Str, Ch) != NULL);
-    } else {
-        len = MR_utf8_encode(buf, Ch);
-        buf[len] = '\\0';
-        SUCCESS_INDICATOR = (len > 0) && (strstr(Str, buf) != NULL);
-    }
-").
-:- pragma foreign_proc("C#",
-    contains_char(Str::in, Ch::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    if (Ch <= 0xffff) {
-        SUCCESS_INDICATOR = (Str.IndexOf((char) Ch) != -1);
-    } else {
-        string s = System.Char.ConvertFromUtf32(Ch);
-        SUCCESS_INDICATOR = Str.Contains(s);
-    }
-").
-:- pragma foreign_proc("Java",
-    contains_char(Str::in, Ch::in),
-    [will_not_call_mercury, promise_pure, thread_safe],
-"
-    // indexOf(int) handles supplementary characters correctly.
-    SUCCESS_INDICATOR = (Str.indexOf((int) Ch) != -1);
-").
-
-contains_char(String, Char) :-
-    contains_char_loop(String, Char, 0).
-
-:- pred contains_char_loop(string::in, char::in, int::in) is semidet.
-
-contains_char_loop(Str, Char, I) :-
-    unsafe_index_next_repl(Str, I, J, IndexChar, MaybeReplaced),
-    ( if
-        MaybeReplaced = not_replaced,
-        IndexChar = Char
-    then
-        true
-    else
-        contains_char_loop(Str, Char, J)
-    ).
+contains_char(Str, Char) :-
+    find_first_char(Str, Char, _).
 
 %---------------------%
 
@@ -3814,6 +3800,148 @@ unsafe_sub_string_search_start_loop(String, SubString, I, LastI,
             SubLen, Index)
     ).
 
+%---------------------%
+
+find_first_char(Str, Char, Index) :-
+    unsafe_find_first_char_start(Str, Char, 0, Index).
+
+find_first_char_start(Str, Char, BeginAt, Index) :-
+    ( if
+        (
+            BeginAt = 0
+        ;
+            BeginAt > 0,
+            BeginAt < string.count_code_units(Str)
+        )
+    then
+        unsafe_find_first_char_start(Str, Char, BeginAt, Index)
+    else
+        fail
+    ).
+
+:- pragma foreign_proc("C",
+    unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    char    *p = NULL;
+
+    if (MR_is_ascii(Ch)) {
+        // strchr will always find the null terminator, but the terminator
+        // is not part of the string.
+        if (Ch != '\\0') {
+            p = strchr(Str + BeginAt, Ch);
+        }
+    } else {
+        char    buf[5];
+        size_t  len;
+
+        len = MR_utf8_encode(buf, Ch);
+        if (len > 0) {
+            buf[len] = '\\0';
+            p = strstr(Str + BeginAt, buf);
+        }
+    }
+    if (p != NULL) {
+        SUCCESS_INDICATOR = MR_TRUE;
+        Index = (p - Str);
+    } else {
+        SUCCESS_INDICATOR = MR_FALSE;
+        Index = -1;
+    }
+").
+:- pragma foreign_proc("C#",
+    unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    if (Ch <= 0xffff) {
+        Index = Str.IndexOf((char) Ch, BeginAt);
+    } else {
+        string s = System.Char.ConvertFromUtf32(Ch);
+        Index = Str.IndexOf(s, BeginAt, System.StringComparison.Ordinal);
+    }
+    SUCCESS_INDICATOR = (Index >= 0);
+").
+:- pragma foreign_proc("Java",
+    unsafe_find_first_char_start(Str::in, Ch::in, BeginAt::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Index = Str.indexOf(Ch, BeginAt);
+    SUCCESS_INDICATOR = (Index >= 0);
+").
+
+%---------------------%
+
+find_last_char(Str, Char, Index) :-
+    % For C grades, find_last_char_2 only works for a single code unit.
+    % i.e. an ASCII character.
+    ( if
+        string.internal_string_encoding = utf8,
+        not char.is_ascii(Char)
+    then
+        find_last_char_loop(Str, Char, 0, string.count_code_units(Str), Index)
+    else
+        find_last_char_2(Str, Char, Index)
+    ).
+
+    % NOTE: the C implementation expects an ASCII character only.
+    %
+:- pred find_last_char_2(string::in, char::in, int::out) is semidet.
+
+:- pragma foreign_proc("C",
+    find_last_char_2(Str::in, Ch::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    char *p = NULL;
+    // strchr will always find the null terminator, but the terminator
+    // is not part of the string.
+    if (Ch != '\\0') {
+        p = strrchr(Str, Ch);
+    }
+    if (p != NULL) {
+        SUCCESS_INDICATOR = MR_TRUE;
+        Index = (p - Str);
+    } else {
+        SUCCESS_INDICATOR = MR_FALSE;
+        Index = -1;
+    }
+").
+:- pragma foreign_proc("C#",
+    find_last_char_2(Str::in, Ch::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    if (Ch <= 0xffff) {
+        Index = Str.LastIndexOf((char) Ch);
+    } else {
+        string s = System.Char.ConvertFromUtf32(Ch);
+        Index = Str.LastIndexOf(s, System.StringComparison.Ordinal);
+    }
+    SUCCESS_INDICATOR = (Index >= 0);
+").
+:- pragma foreign_proc("Java",
+    find_last_char_2(Str::in, Ch::in, Index::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Index = Str.lastIndexOf(Ch);
+    SUCCESS_INDICATOR = (Index >= 0);
+").
+
+    % Pre-condition: LowerIndex =< Index0
+    %
+:- pred find_last_char_loop(string::in, char::in, int::in, int::in, int::out)
+    is semidet.
+
+find_last_char_loop(Str, MatchChar, LowerIndex, Index0, MatchIndex) :-
+    string.unsafe_prev_index_repl(Str, Index0, Index1, Char, MaybeReplaced),
+    LowerIndex =< Index1,
+    ( if
+        MaybeReplaced = not_replaced,
+        MatchChar = Char
+    then
+        MatchIndex = Index1
+    else
+        find_last_char_loop(Str, MatchChar, LowerIndex, Index1, MatchIndex)
+    ).
+
 %---------------------------------------------------------------------------%
 %
 % Appending strings.
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index ee30efb57..81c5b40e5 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -403,6 +403,7 @@ ORDINARY_PROGS = \
 	string_contains_char \
 	string_contains_match \
 	string_count_codepoints_ilseq \
+	string_find_char \
 	string_first_char \
 	string_fold_ilseq \
 	string_from_char_list_ilseq \
diff --git a/tests/hard_coded/string_find_char.exp b/tests/hard_coded/string_find_char.exp
new file mode 100644
index 000000000..eb6b82659
--- /dev/null
+++ b/tests/hard_coded/string_find_char.exp
@@ -0,0 +1,11 @@
+*   *   😀  A   B   😀  |   😀  A   B   
+0   1   2   6   7   8   12  13  17  18  
+
+find_first_char ==> index=6, char='A'
+find_first_char ==> index=2, char='😀'
+find_first_char failed
+
+find_last_char ==> index=17, char='A'
+find_last_char ==> index=13, char='😀'
+find_last_char failed
+
diff --git a/tests/hard_coded/string_find_char.exp2 b/tests/hard_coded/string_find_char.exp2
new file mode 100644
index 000000000..704c570b1
--- /dev/null
+++ b/tests/hard_coded/string_find_char.exp2
@@ -0,0 +1,11 @@
+*   *   😀  A   B   😀  |   😀  A   B   
+0   1   2   4   5   6   8   9   11  12  
+
+find_first_char ==> index=4, char='A'
+find_first_char ==> index=2, char='😀'
+find_first_char failed
+
+find_last_char ==> index=11, char='A'
+find_last_char ==> index=9, char='😀'
+find_last_char failed
+
diff --git a/tests/hard_coded/string_find_char.m b/tests/hard_coded/string_find_char.m
new file mode 100644
index 000000000..00ca2509b
--- /dev/null
+++ b/tests/hard_coded/string_find_char.m
@@ -0,0 +1,137 @@
+%---------------------------------------------------------------------------%
+% vim: ts=4 sw=4 et ft=mercury
+%---------------------------------------------------------------------------%
+%
+% The .exp file is for backends using UTF-8 string encoding.
+% The .exp2 file is for backends using UTF-16 string encoding.
+%
+%---------------------------------------------------------------------------%
+
+:- module string_find_char.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int.
+:- import_module char.
+:- import_module list.
+:- import_module string.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    Str0 = "**",
+    Str1 = Str0 ++ "😀",
+    Str2 = Str1 ++ "AB😀",
+    Str  = Str2 ++ "|😀AB",
+
+    string.count_code_units(Str0, Happy1),
+    string.count_code_units(Str1, A1),
+    string.count_code_units(Str2, Bar),
+    string.count_code_units(Str, End),
+
+    _Offsets = [
+        {-1, End},          % out of range
+        {0, End + 1},       % out of range
+        {A1, A1},           % zero length span
+        {0, End},           % whole string
+
+        {0, A1},            % contains 1st emoji
+        {0, A1 - 1},        % 1st emoji truncated, should not be found
+
+        {A1, Bar},          % contains 2nd emoji
+        {A1, Bar - 1},      % 2nd emoji truncated, should not be found
+
+        {Happy1, Bar},      % contains 1st A, two emoji
+        {Bar, End}          % contains 2nd A, one emoji
+    ],
+
+    % Show the string being searched.
+    show_string(Str, !IO),
+    io.nl(!IO),
+
+    Ascii = 'A',
+    Happy = '😀',
+    NotPresent = '?',
+
+    % Find first char.
+    test_find_first_char(Str, Ascii, !IO),
+    test_find_first_char(Str, Happy, !IO),
+    test_find_first_char(Str, NotPresent, !IO),
+    io.nl(!IO),
+
+    % Find last char.
+    test_find_last_char(Str, Ascii, !IO),
+    test_find_last_char(Str, Happy, !IO),
+    test_find_last_char(Str, NotPresent, !IO),
+    io.nl(!IO),
+
+    /*
+    % Find first char between, single and multiple code units.
+    list.foldl(test_find_first_char_between(Ascii, Str), Offsets, !IO),
+    io.nl(!IO),
+    list.foldl(test_find_first_char_between(Happy, Str), Offsets, !IO),
+    io.nl(!IO),
+
+    % Find last char between, single and multiple code units.
+    list.foldl(test_find_last_char_between(Ascii, Str), Offsets, !IO),
+    io.nl(!IO),
+    list.foldl(test_find_last_char_between(Happy, Str), Offsets, !IO),
+    io.nl(!IO),
+    */
+    true.
+
+%---------------------------------------------------------------------------%
+
+:- pred show_string(string::in, io::di, io::uo) is det.
+
+show_string(Str, !IO) :-
+    string.to_char_list(Str, CharList),
+    list.foldl(write_char_padded, CharList, !IO),
+    io.nl(!IO),
+    list.foldl2(write_char_offset, CharList, 0, _Offset, !IO),
+    io.nl(!IO).
+
+:- pred write_char_padded(char::in, io::di, io::uo) is det.
+
+write_char_padded(Char, !IO) :-
+    ( if char.to_int(Char) >= 0x1F600 then
+        io.format("%-3c", [c(Char)], !IO)
+    else
+        io.format("%-4c", [c(Char)], !IO)
+    ).
+
+:- pred write_char_offset(char::in, int::in, int::out, io::di, io::uo) is det.
+
+write_char_offset(Char, Offset0, Offset, !IO) :-
+    io.format("%-4d", [i(Offset0)], !IO),
+    Str = string.from_char(Char),
+    Offset = Offset0 + string.count_code_units(Str).
+
+%---------------------------------------------------------------------------%
+
+:- pred test_find_first_char(string::in, char::in, io::di, io::uo) is det.
+
+test_find_first_char(Str, Char, !IO) :-
+    ( if find_first_char(Str, Char, Index) then
+        io.format("find_first_char ==> index=%d, char='%c'\n",
+            [i(Index), c(Char)], !IO)
+    else
+        io.write_string("find_first_char failed\n", !IO)
+    ).
+
+:- pred test_find_last_char(string::in, char::in, io::di, io::uo) is det.
+
+test_find_last_char(Str, Char, !IO) :-
+    ( if find_last_char(Str, Char, Index) then
+        io.format("find_last_char ==> index=%d, char='%c'\n",
+            [i(Index), c(Char)], !IO)
+    else
+        io.write_string("find_last_char failed\n", !IO)
+    ).
-- 
2.44.0



More information about the reviews mailing list