[m-rev.] for review: Add string indexing predicates that indicate if the char was replaced.
Peter Wang
novalazy at gmail.com
Wed Oct 30 17:09:34 AEDT 2019
library/string.m:
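Add index_next_repl, unsafe_index_next_repl, prev_index_repl,
unsafe_prev_index_repl predicates. These are internal for now,
so we can try them out in the string module without committing
to the interface.
Reimplement index_next, unsafe_index_next, prev_index and
unsafe_prev_index as wrappers around the new predicates,
ignoring the IsReplaced output.
Delete index_next_not_replaced. Its callers (first_char_rest_in,
first_char_rest_out and the reverse mode of char_to_string) now
call index_next_repl and check that IsReplaced = no.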
---
library/string.m | 104 +++++++++++++++++++++++++++++++++--------------
1 file changed, 73 insertions(+), 31 deletions(-)
diff --git a/library/string.m b/library/string.m
index 8c87b545c..950146233 100644
--- a/library/string.m
+++ b/library/string.m
@@ -1572,6 +1572,7 @@
:- include_module parse_runtime.
:- include_module to_string.
+:- import_module bool.
:- import_module int.
:- import_module pair.
:- import_module require.
@@ -2239,7 +2240,9 @@ duplicate_char(Char, Count, String) :-
:- pragma inline(index/3).
:- pragma inline(det_index/3).
:- pragma inline(index_next/4).
+:- pragma inline(index_next_repl/5).
:- pragma inline(prev_index/4).
+:- pragma inline(prev_index_repl/5).
index(Str, Index, Char) :-
Len = length(Str),
@@ -2315,19 +2318,42 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
%---------------------%
index_next(Str, Index, NextIndex, Char) :-
+ index_next_repl(Str, Index, NextIndex, Char, _IsReplaced).
+
+unsafe_index_next(Str, Index, NextIndex, Ch) :-
+ unsafe_index_next_repl(Str, Index, NextIndex, Ch, _IsReplaced).
+
+ % XXX ILSEQ Export something like this.
+ % index_next_repl(String, Index, NextIndex, Char, IsReplaced):
+ %
+ % Like index_next/4 but `IsReplaced' is `yes' iff `Char' is U+FFFD but
+ % `String' does NOT contain an encoding of U+FFFD beginning at `Index'.
+ % (`IsReplaced' is always `no' when strings are UTF-16 encoded.)
+ %
+:- pred index_next_repl(string::in, int::in, int::out, char::uo, bool::out)
+ is semidet.
+
+index_next_repl(Str, Index, NextIndex, Char, IsReplaced) :-
Len = length(Str),
( if index_check(Index, Len) then
- unsafe_index_next(Str, Index, NextIndex, Char)
+ unsafe_index_next_repl(Str, Index, NextIndex, Char, IsReplaced)
else
fail
).
+ % XXX ILSEQ Export something like this.
+ %
+:- pred unsafe_index_next_repl(string::in, int::in, int::out, char::uo,
+ bool::out) is semidet.
+
:- pragma foreign_proc("C",
- unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
Ch = Str[Index];
+ IsReplaced = MR_FALSE;
if (MR_is_ascii(Ch)) {
NextIndex = Index + 1;
SUCCESS_INDICATOR = (Ch != 0);
@@ -2336,16 +2362,19 @@ index_next(Str, Index, NextIndex, Char) :-
Ch = MR_utf8_get_next_mb(Str, &NextIndex);
if (Ch < 0) {
Ch = 0xfffd;
+ IsReplaced = MR_TRUE;
NextIndex = Index + 1;
}
SUCCESS_INDICATOR = MR_TRUE;
}
").
:- pragma foreign_proc("C#",
- unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ IsReplaced = mr_bool.NO;
try {
Ch = System.Char.ConvertToUtf32(Str, Index);
if (Ch <= 0xffff) {
@@ -2366,10 +2395,12 @@ index_next(Str, Index, NextIndex, Char) :-
}
").
:- pragma foreign_proc("Java",
- unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ IsReplaced = bool.NO;
try {
Ch = Str.codePointAt(Index);
NextIndex = Index + java.lang.Character.charCount(Ch);
@@ -2381,7 +2412,8 @@ index_next(Str, Index, NextIndex, Char) :-
}
").
:- pragma foreign_proc("Erlang",
- unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
+ unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
@@ -2398,9 +2430,11 @@ index_next(Str, Index, NextIndex, Char) :-
true ->
NextIndex = Index + 4
end,
+ IsReplaced = {no},
SUCCESS_INDICATOR = true;
_ ->
Ch = -1,
+ IsReplaced = {no},
NextIndex = Index,
SUCCESS_INDICATOR = false
end
@@ -2408,39 +2442,37 @@ index_next(Str, Index, NextIndex, Char) :-
%---------------------%
- % XXX ILSEQ Provide public interfaces to index into strings while
- % signalling if we encountered an ill-formed sequence.
- %
-:- pred index_next_not_replaced(string::in, int::in, int::out, char::uo)
- is semidet.
+prev_index(Str, Index, PrevIndex, Char) :-
+ prev_index_repl(Str, Index, PrevIndex, Char, _IsReplaced).
-index_next_not_replaced(Str, Index, NextIndex, Char) :-
- index_next(Str, Index, NextIndex, Char0),
- ( if
- internal_encoding_is_utf8,
- Char0 = '\ufffd'
- then
- unsafe_between(Str, Index, NextIndex, "\ufffd")
- else
- true
- ),
- unsafe_promise_unique(Char0, Char).
+unsafe_prev_index(Str, Index, PrevIndex, Ch) :-
+ unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, _IsReplaced).
-%---------------------%
+ % XXX ILSEQ Export something like this.
+ %
+:- pred prev_index_repl(string::in, int::in, int::out, char::uo, bool::out)
+ is semidet.
-prev_index(Str, Index, PrevIndex, Char) :-
+prev_index_repl(Str, Index, PrevIndex, Char, IsReplaced) :-
Len = length(Str),
( if index_check(Index - 1, Len) then
- unsafe_prev_index(Str, Index, PrevIndex, Char)
+ unsafe_prev_index_repl(Str, Index, PrevIndex, Char, IsReplaced)
else
fail
).
+ % XXX ILSEQ Export something like this.
+ %
+:- pred unsafe_prev_index_repl(string::in, int::in, int::out, char::uo,
+ bool::out) is semidet.
+
:- pragma foreign_proc("C",
- unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ IsReplaced = MR_FALSE;
if (Index <= 0) {
PrevIndex = Index;
Ch = 0;
@@ -2455,6 +2487,7 @@ prev_index(Str, Index, PrevIndex, Char) :-
// unaccounted for.
if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
Ch = 0xfffd;
+ IsReplaced = MR_TRUE;
PrevIndex = Index - 1;
}
}
@@ -2462,10 +2495,12 @@ prev_index(Str, Index, PrevIndex, Char) :-
}
").
:- pragma foreign_proc("C#",
- unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ IsReplaced = mr_bool.NO;
if (Index <= 0) {
Ch = 0;
PrevIndex = Index;
@@ -2494,10 +2529,12 @@ prev_index(Str, Index, PrevIndex, Char) :-
}
").
:- pragma foreign_proc("Java",
- unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, no_sharing],
"
+ IsReplaced = bool.NO;
try {
Ch = Str.codePointBefore(Index);
PrevIndex = Index - java.lang.Character.charCount(Ch);
@@ -2509,12 +2546,14 @@ prev_index(Str, Index, PrevIndex, Char) :-
}
").
:- pragma foreign_proc("Erlang",
- unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
+ unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
+ IsReplaced::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness, may_not_duplicate, no_sharing],
"
% XXX does not handle ill-formed sequences as described
{PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
+ IsReplaced = {no},
SUCCESS_INDICATOR = (Ch =/= -1)
").
@@ -3949,7 +3988,8 @@ first_char(Str::uo, First::in, Rest::in) :-
:- mode first_char_rest_in(in, uo, in) is semidet.
first_char_rest_in(Str, First, Rest) :-
- index_next_not_replaced(Str, 0, NextIndex, First0),
+ index_next_repl(Str, 0, NextIndex, First0, IsReplaced),
+ IsReplaced = no,
not is_surrogate(First0),
unsafe_promise_unique(First0, First),
unsafe_compare_substrings((=), Str, NextIndex, Rest, 0, length(Rest)).
@@ -3959,7 +3999,8 @@ first_char_rest_in(Str, First, Rest) :-
:- mode first_char_rest_out(in, uo, uo) is semidet.
first_char_rest_out(Str, First, Rest) :-
- index_next_not_replaced(Str, 0, NextIndex, First0),
+ index_next_repl(Str, 0, NextIndex, First0, IsReplaced),
+ IsReplaced = no,
not is_surrogate(First0),
unsafe_promise_unique(First0, First),
unsafe_between(Str, NextIndex, length(Str), Rest).
@@ -5348,7 +5389,8 @@ char_to_string(C) = S1 :-
char_to_string(Char::in, String::uo) :-
from_char_list([Char], String).
char_to_string(Char::out, String::in) :-
- index_next_not_replaced(String, 0, NextIndex, Char),
+ index_next_repl(String, 0, NextIndex, Char, IsReplaced),
+ IsReplaced = no,
length(String, NextIndex).
from_char(Char) = char_to_string(Char).
--
2.23.0
More information about the reviews mailing list