[m-rev.] for review: Add string indexing predicates that indicate if the char was replaced.

Mark Brown mark at mercurylang.org
Wed Oct 30 18:48:56 AEDT 2019


This looks fine.

On Wed, Oct 30, 2019 at 5:10 PM Peter Wang <novalazy at gmail.com> wrote:
>
> library/string.m:
>     Add index_next_repl, unsafe_index_next_repl, prev_index_repl,
>     unsafe_prev_index_repl predicates. These are internal for now,
>     so we can try them out in the string module without committing
>     to the interface.
> ---
>  library/string.m | 104 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 73 insertions(+), 31 deletions(-)
>
> diff --git a/library/string.m b/library/string.m
> index 8c87b545c..950146233 100644
> --- a/library/string.m
> +++ b/library/string.m
> @@ -1572,6 +1572,7 @@
>  :- include_module parse_runtime.
>  :- include_module to_string.
>
> +:- import_module bool.
>  :- import_module int.
>  :- import_module pair.
>  :- import_module require.
> @@ -2239,7 +2240,9 @@ duplicate_char(Char, Count, String) :-
>  :- pragma inline(index/3).
>  :- pragma inline(det_index/3).
>  :- pragma inline(index_next/4).
> +:- pragma inline(index_next_repl/5).
>  :- pragma inline(prev_index/4).
> +:- pragma inline(prev_index_repl/5).
>
>  index(Str, Index, Char) :-
>      Len = length(Str),
> @@ -2315,19 +2318,42 @@ String ^ unsafe_elem(Index) = unsafe_index(String, Index).
>  %---------------------%
>
>  index_next(Str, Index, NextIndex, Char) :-
> +    index_next_repl(Str, Index, NextIndex, Char, _IsReplaced).
> +
> +unsafe_index_next(Str, Index, NextIndex, Ch) :-
> +    unsafe_index_next_repl(Str, Index, NextIndex, Ch, _IsReplaced).
> +
> +    % XXX ILSEQ Export something like this.
> +    % index_next_repl(String, Index, NextIndex, Char, IsReplaced):
> +    %
> +    % Like index_next/4 but `IsReplaced' is `yes' iff `Char' is U+FFFD but
> +    % `String' does NOT contain an encoding of U+FFFD beginning at `Index'.
> +    % (`IsReplaced' is always `no' when strings are UTF-16 encoded.)
> +    %
> +:- pred index_next_repl(string::in, int::in, int::out, char::uo, bool::out)
> +    is semidet.
> +
> +index_next_repl(Str, Index, NextIndex, Char, IsReplaced) :-
>      Len = length(Str),
>      ( if index_check(Index, Len) then
> -        unsafe_index_next(Str, Index, NextIndex, Char)
> +        unsafe_index_next_repl(Str, Index, NextIndex, Char, IsReplaced)
>      else
>          fail
>      ).
>
> +    % XXX ILSEQ Export something like this.
> +    %
> +:- pred unsafe_index_next_repl(string::in, int::in, int::out, char::uo,
> +    bool::out) is semidet.
> +
>  :- pragma foreign_proc("C",
> -    unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
> +    unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
>      Ch = Str[Index];
> +    IsReplaced = MR_FALSE;
>      if (MR_is_ascii(Ch)) {
>          NextIndex = Index + 1;
>          SUCCESS_INDICATOR = (Ch != 0);
> @@ -2336,16 +2362,19 @@ index_next(Str, Index, NextIndex, Char) :-
>          Ch = MR_utf8_get_next_mb(Str, &NextIndex);
>          if (Ch < 0) {
>              Ch = 0xfffd;
> +            IsReplaced = MR_TRUE;
>              NextIndex = Index + 1;
>          }
>          SUCCESS_INDICATOR = MR_TRUE;
>      }
>  ").
>  :- pragma foreign_proc("C#",
> -    unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
> +    unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    IsReplaced = mr_bool.NO;
>      try {
>          Ch = System.Char.ConvertToUtf32(Str, Index);
>          if (Ch <= 0xffff) {
> @@ -2366,10 +2395,12 @@ index_next(Str, Index, NextIndex, Char) :-
>      }
>  ").
>  :- pragma foreign_proc("Java",
> -    unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
> +    unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    IsReplaced = bool.NO;
>      try {
>          Ch = Str.codePointAt(Index);
>          NextIndex = Index + java.lang.Character.charCount(Ch);
> @@ -2381,7 +2412,8 @@ index_next(Str, Index, NextIndex, Char) :-
>      }
>  ").
>  :- pragma foreign_proc("Erlang",
> -    unsafe_index_next(Str::in, Index::in, NextIndex::out, Ch::uo),
> +    unsafe_index_next_repl(Str::in, Index::in, NextIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> @@ -2398,9 +2430,11 @@ index_next(Str, Index, NextIndex, Char) :-
>                  true ->
>                      NextIndex = Index + 4
>              end,
> +            IsReplaced = {no},
>              SUCCESS_INDICATOR = true;
>          _ ->
>              Ch = -1,
> +            IsReplaced = {no},
>              NextIndex = Index,
>              SUCCESS_INDICATOR = false
>      end
> @@ -2408,39 +2442,37 @@ index_next(Str, Index, NextIndex, Char) :-
>
>  %---------------------%
>
> -    % XXX ILSEQ Provide public interfaces to index into strings while
> -    % signalling if we encountered an ill-formed sequence.
> -    %
> -:- pred index_next_not_replaced(string::in, int::in, int::out, char::uo)
> -    is semidet.
> +prev_index(Str, Index, PrevIndex, Char) :-
> +    prev_index_repl(Str, Index, PrevIndex, Char, _IsReplaced).
>
> -index_next_not_replaced(Str, Index, NextIndex, Char) :-
> -    index_next(Str, Index, NextIndex, Char0),
> -    ( if
> -        internal_encoding_is_utf8,
> -        Char0 = '\ufffd'
> -    then
> -        unsafe_between(Str, Index, NextIndex, "\ufffd")
> -    else
> -        true
> -    ),
> -    unsafe_promise_unique(Char0, Char).
> +unsafe_prev_index(Str, Index, PrevIndex, Ch) :-
> +    unsafe_prev_index_repl(Str, Index, PrevIndex, Ch, _IsReplaced).
>
> -%---------------------%
> +    % XXX ILSEQ Export something like this.
> +    %
> +:- pred prev_index_repl(string::in, int::in, int::out, char::uo, bool::out)
> +    is semidet.
>
> -prev_index(Str, Index, PrevIndex, Char) :-
> +prev_index_repl(Str, Index, PrevIndex, Char, IsReplaced) :-
>      Len = length(Str),
>      ( if index_check(Index - 1, Len) then
> -        unsafe_prev_index(Str, Index, PrevIndex, Char)
> +        unsafe_prev_index_repl(Str, Index, PrevIndex, Char, IsReplaced)
>      else
>          fail
>      ).
>
> +    % XXX ILSEQ Export something like this.
> +    %
> +:- pred unsafe_prev_index_repl(string::in, int::in, int::out, char::uo,
> +    bool::out) is semidet.
> +
>  :- pragma foreign_proc("C",
> -    unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
> +    unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    IsReplaced = MR_FALSE;
>      if (Index <= 0) {
>          PrevIndex = Index;
>          Ch = 0;
> @@ -2455,6 +2487,7 @@ prev_index(Str, Index, PrevIndex, Char) :-
>              // unaccounted for.
>              if (Ch < 0 || PrevIndex + MR_utf8_width(Ch) != Index) {
>                  Ch = 0xfffd;
> +                IsReplaced = MR_TRUE;
>                  PrevIndex = Index - 1;
>              }
>          }
> @@ -2462,10 +2495,12 @@ prev_index(Str, Index, PrevIndex, Char) :-
>      }
>  ").
>  :- pragma foreign_proc("C#",
> -    unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
> +    unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    IsReplaced = mr_bool.NO;
>      if (Index <= 0) {
>          Ch = 0;
>          PrevIndex = Index;
> @@ -2494,10 +2529,12 @@ prev_index(Str, Index, PrevIndex, Char) :-
>      }
>  ").
>  :- pragma foreign_proc("Java",
> -    unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
> +    unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, no_sharing],
>  "
> +    IsReplaced = bool.NO;
>      try {
>          Ch = Str.codePointBefore(Index);
>          PrevIndex = Index - java.lang.Character.charCount(Ch);
> @@ -2509,12 +2546,14 @@ prev_index(Str, Index, PrevIndex, Char) :-
>      }
>  ").
>  :- pragma foreign_proc("Erlang",
> -    unsafe_prev_index(Str::in, Index::in, PrevIndex::out, Ch::uo),
> +    unsafe_prev_index_repl(Str::in, Index::in, PrevIndex::out, Ch::uo,
> +        IsReplaced::out),
>      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
>          does_not_affect_liveness, may_not_duplicate, no_sharing],
>  "
>      % XXX does not handle ill-formed sequences as described
>      {PrevIndex, Ch} = do_unsafe_prev_index(Str, Index - 1),
> +    IsReplaced = {no},
>      SUCCESS_INDICATOR = (Ch =/= -1)
>  ").
>
> @@ -3949,7 +3988,8 @@ first_char(Str::uo, First::in, Rest::in) :-
>  :- mode first_char_rest_in(in, uo, in) is semidet.
>
>  first_char_rest_in(Str, First, Rest) :-
> -    index_next_not_replaced(Str, 0, NextIndex, First0),
> +    index_next_repl(Str, 0, NextIndex, First0, IsReplaced),
> +    IsReplaced = no,
>      not is_surrogate(First0),
>      unsafe_promise_unique(First0, First),
>      unsafe_compare_substrings((=), Str, NextIndex, Rest, 0, length(Rest)).
> @@ -3959,7 +3999,8 @@ first_char_rest_in(Str, First, Rest) :-
>  :- mode first_char_rest_out(in, uo, uo) is semidet.
>
>  first_char_rest_out(Str, First, Rest) :-
> -    index_next_not_replaced(Str, 0, NextIndex, First0),
> +    index_next_repl(Str, 0, NextIndex, First0, IsReplaced),
> +    IsReplaced = no,
>      not is_surrogate(First0),
>      unsafe_promise_unique(First0, First),
>      unsafe_between(Str, NextIndex, length(Str), Rest).
> @@ -5348,7 +5389,8 @@ char_to_string(C) = S1 :-
>  char_to_string(Char::in, String::uo) :-
>      from_char_list([Char], String).
>  char_to_string(Char::out, String::in) :-
> -    index_next_not_replaced(String, 0, NextIndex, Char),
> +    index_next_repl(String, 0, NextIndex, Char, IsReplaced),
> +    IsReplaced = no,
>      length(String, NextIndex).
>
>  from_char(Char) = char_to_string(Char).
> --
> 2.23.0
>
> _______________________________________________
> reviews mailing list
> reviews at lists.mercurylang.org
> https://lists.mercurylang.org/listinfo/reviews


More information about the reviews mailing list