[m-rev.] diff: (2nd try) string splitting routines

Ondrej Bojar obo at cuni.cz
Fri Feb 2 14:32:34 AEDT 2007


(Extra comment for Ralph: split_at_string is useful for splitting at 
e.g. "<tab>".)

Estimated hours taken: 3

Add a few handy functions for splitting strings.

library/string.m:
     Added remove_suffix_if_present, split_at_separator, split_at_char,
     split_at_string

tests/hard_coded/string_split.m:
     A simple test of split_at_* functions.

tests/hard_coded/string_split.exp:
     Expected results of the tests of split_at_* functions.

tests/hard_coded/string_various.m:
     Created. Added testcase for remove_suffix_if_present.

tests/hard_coded/string_various.exp:
     Created. Added results for remove_suffix_if_present.


Index: library/string.m
===================================================================
RCS file: /home/mercury/mercury1/repository/mercury/library/string.m,v
retrieving revision 1.254
diff -u -r1.254 string.m
--- library/string.m	18 Jan 2007 07:33:03 -0000	1.254
+++ library/string.m	2 Feb 2007 03:26:14 -0000
@@ -67,6 +67,11 @@
      %
  :- pred string.remove_suffix(string::in, string::in, string::out) is 
semidet.

+    % string.remove_suffix_if_present(Suffix, String) returns `String' minus
+    % `Suffix' if `String' ends with `Suffix', and `String' itself otherwise.
+    %
+:- func string.remove_suffix_if_present(string, string) = string.
+
      % string.prefix(String, Prefix) is true iff Prefix is a prefix of 
String.
      % Same as string.append(Prefix, _, String).
      %
@@ -555,6 +560,8 @@
      % string.words_separator(char.is_whitespace, " the cat  sat on the 
  mat") =
      %   ["the", "cat", "sat", "on", "the", "mat"]
      %
+    % Compare string.split_at_separator, which keeps empty substrings.
+    %
  :- func string.words_separator(pred(char), string) = list(string).
  :- mode string.words_separator(pred(in) is semidet, in) = out is det.

@@ -563,6 +570,34 @@
      %
  :- func string.words(string) = list(string).

+    % string.split_at_separator(SepP, String) returns the list of
+    % substrings of String (in first to last order) that are delimited
+    % by chars matched by SepP. For example,
+    %
+    % string.split_at_separator(char.is_whitespace,
+    %       " a cat  sat on the  mat")
+    %   = ["", "a", "cat", "", "sat", "on", "the", "", "mat"]
+    %
+    % Unlike string.words_separator, adjacent separators are not merged:
+    % every separator char ends a (possibly empty) substring.
+    %
+:- func string.split_at_separator(pred(char), string) = list(string).
+:- mode string.split_at_separator(pred(in) is semidet, in) = out is det.
+
+    % string.split_at_char(Char, String) =
+    %     string.split_at_separator(unify(Char), String)
+    %
+    % That is, String is split at each occurrence of Char.
+    %
+:- func string.split_at_char(char, string) = list(string).
+
+    % string.split_at_string(Separator, String) returns the list of
+    % substrings of String that are delimited by Separator. For example,
+    %
+    % string.split_at_string("|||", "|||fld2|||fld3")
+    %   = ["", "fld2", "fld3"]
+    %
+    % The leftmost match of Separator is always the one used to break the
+    % String, for example:
+    %
+    % string.split_at_string("aa", "xaaayaaaz") = ["x", "ay", "az"]
+    %
+:- func string.split_at_string(string, string) = list(string).
+
      % string.split(String, Count, LeftSubstring, RightSubstring):
      % `LeftSubstring' is the left-most `Count' characters of `String',
      % and `RightSubstring' is the remainder of `String'.
@@ -969,6 +1004,17 @@
      string.to_char_list(C, LC),
      char_list_remove_suffix(LA, LB, LC).

+string.remove_suffix_if_present(Suffix, String) = Result :-
+    % Cut String so that the right-hand piece is as long as Suffix, then
+    % compare that piece with Suffix. PrefixLen is negative when Suffix is
+    % longer than String; string.split is documented to clamp such counts.
+    PrefixLen = string.length(String) - string.length(Suffix),
+    string.split(String, PrefixLen, Prefix, Rest),
+    ( Rest = Suffix ->
+        Result = Prefix
+    ;
+        Result = String
+    ).
+
+
  :- pragma promise_equivalent_clauses(string.prefix/2).

  string.prefix(String::in, Prefix::in) :-
@@ -4108,6 +4154,56 @@

 
%------------------------------------------------------------------------------%

+string.split_at_separator(DelimPred, InStr) = OutStrs :-
+    % split_at_separator2 scans backwards and treats both of its index
+    % arguments as inclusive, so start at the last valid index (which is -1
+    % for the empty string) rather than at the length. The length itself
+    % is one past the end of InStr: passing it to unsafe_index reads out of
+    % range, and it makes the final substring count one char too large.
+    Last = string.length(InStr) - 1,
+    split_at_separator2(DelimPred, InStr, Last, Last, [], OutStrs).
+
+:- pred split_at_separator2(pred(char)::in(pred(in) is semidet), string::in,
+    int::in, int::in, list(string)::in, list(string)::out) is det.
+split_at_separator2(IsSep, Str, Pos, SegLast, Acc0, Acc) :-
+    % Scan Str from right to left, starting at index Pos. SegLast is the
+    % inclusive index of the last char of the segment being collected;
+    % completed segments are pushed onto the front of Acc0.
+    ( Pos >= 0 ->
+        Ch = string.unsafe_index(Str, Pos),
+        Prev = Pos - 1,
+        ( IsSep(Ch) ->
+            % Separator at Pos: emit the chars in Pos+1 .. SegLast.
+            Seg = string.unsafe_substring(Str, Pos + 1, SegLast - Pos),
+            split_at_separator2(IsSep, Str, Prev, Prev, [Seg | Acc0], Acc)
+        ;
+            % Ordinary char: keep growing the current segment leftwards.
+            split_at_separator2(IsSep, Str, Prev, SegLast, Acc0, Acc)
+        )
+    ; SegLast >= 0 ->
+        % Ran off the front: the first segment is the chars 0 .. SegLast.
+        Acc = [string.unsafe_substring(Str, 0, SegLast + 1) | Acc0]
+    ;
+        % Nothing is left before the front, so the first segment is "".
+        Acc = ["" | Acc0]
+    ).
+
+%------------------------------------------------------------------------------%
+
+string.split_at_char(Sep, String) = Fields :-
+    % A char is a separator exactly when it unifies with Sep.
+    Fields = string.split_at_separator(unify(Sep), String).
+
+%------------------------------------------------------------------------------%
+
+split_at_string(Needle, Total) = Parts :-
+    NeedleLen = length(Needle),
+    ( NeedleLen = 0 ->
+        % An empty separator matches at the search start without consuming
+        % anything, so the recursive search below would never advance.
+        % Treat it as "no separator present" instead of looping forever.
+        Parts = [Total]
+    ;
+        Parts = split_at_string(0, NeedleLen, Needle, Total)
+    ).
+
+    % split_at_string(StartAt, NeedleLen, Needle, Total) splits the part of
+    % Total that begins at offset StartAt at each leftmost occurrence of
+    % Needle. NeedleLen must be the (non-zero) length of Needle.
+    %
+:- func split_at_string(int, int, string, string) = list(string).
+split_at_string(StartAt, NeedleLen, Needle, Total) = Out :-
+    ( sub_string_search_start(Total, Needle, StartAt, NeedlePos) ->
+        BeforeNeedle = substring(Total, StartAt, NeedlePos - StartAt),
+        Tail = split_at_string(NeedlePos + NeedleLen, NeedleLen, Needle,
+            Total),
+        Out = [BeforeNeedle | Tail]
+    ;
+        % No further match: everything from StartAt onwards is the last
+        % piece (possibly "" when Total ends with Needle).
+        string.split(Total, StartAt, _Skipped, Last),
+        Out = [Last]
+    ).
+
+%------------------------------------------------------------------------------%
+
      % preceding_boundary(SepP, String, I) returns the largest index J =< I
      % in String of the char that is SepP and min(-1, I) if there is no 
such J.
      % preceding_boundary/3 is intended for finding (in reverse) 
consecutive
Index: tests/hard_coded/string_split.exp
===================================================================
RCS file: tests/hard_coded/string_split.exp
diff -N tests/hard_coded/string_split.exp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_split.exp	2 Feb 2007 03:00:02 -0000
@@ -0,0 +1,6 @@
+hello:world:how:are:you!
+hello<tab>world<tab>how<tab>are<tab><tab>you!
+user<tab>group<tab>id1<tab>id2
+x<tab>ay<tab>az
+x<tab>a <tab>aax <tab> x
+col1<tab>col2:val2<tab>col3<tab>
Index: tests/hard_coded/string_split.m
===================================================================
RCS file: tests/hard_coded/string_split.m
diff -N tests/hard_coded/string_split.m
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_split.m	2 Feb 2007 02:36:14 -0000
@@ -0,0 +1,36 @@
+:- module string_split.
+:- interface.
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+:- implementation.
+
+:- import_module string, char.
+
+main(!IO) :-
+    % Each call prints one split result on its own line, with the pieces
+    % joined by a visible marker; the output must match string_split.exp.
+    io.write_list(
+        split_at_separator(char.is_upper, "helloXworldXhowXareYyou!"),
+        ":", io.write_string, !IO),
+    io.nl(!IO),
+    % Two consecutive tabs must yield an empty piece between them.
+    % The input literal has to stay on a single line: a line break inside
+    % it would add another whitespace char and hence another piece.
+    io.write_list(
+        split_at_separator(char.is_whitespace,
+            "hello world\thow are\t\tyou!"),
+        "<tab>", io.write_string, !IO),
+    io.nl(!IO),
+    io.write_list(
+        split_at_char(':', "user:group:id1:id2"),
+        "<tab>", io.write_string, !IO),
+    io.nl(!IO),
+    % Overlapping candidates: the leftmost match is taken each time.
+    io.write_list(
+        split_at_string("aa", "xaaayaaaz"),
+        "<tab>", io.write_string, !IO),
+    io.nl(!IO),
+    io.write_list(
+        split_at_string("aaa", "xaaaa aaaaax aaa x"),
+        "<tab>", io.write_string, !IO),
+    io.nl(!IO),
+    % A trailing separator produces a final empty piece.
+    io.write_list(
+        split_at_string(":::", "col1:::col2:val2:::col3:::"),
+        "<tab>", io.write_string, !IO),
+    io.nl(!IO).
Index: tests/hard_coded/string_various.exp
===================================================================
RCS file: tests/hard_coded/string_various.exp
diff -N tests/hard_coded/string_various.exp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_various.exp	2 Feb 2007 02:59:24 -0000
@@ -0,0 +1,3 @@
+myfile
+myfile
+myfile.gz
Index: tests/hard_coded/string_various.m
===================================================================
RCS file: tests/hard_coded/string_various.m
diff -N tests/hard_coded/string_various.m
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_various.m	2 Feb 2007 02:58:29 -0000
@@ -0,0 +1,18 @@
+:- module string_various.
+:- interface.
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+:- implementation.
+
+:- import_module string, char.
+
+main(!IO) :-
+    % Suffix absent: the name is printed unchanged.
+    Plain = remove_suffix_if_present(".gz", "myfile"),
+    io.write_string(Plain, !IO),
+    io.nl(!IO),
+    % Suffix present once: it is stripped.
+    Once = remove_suffix_if_present(".gz", "myfile.gz"),
+    io.write_string(Once, !IO),
+    io.nl(!IO),
+    % Suffix present twice: only the final occurrence is stripped.
+    Twice = remove_suffix_if_present(".gz", "myfile.gz.gz"),
+    io.write_string(Twice, !IO),
+    io.nl(!IO).

-- 
Ondrej Bojar (mailto:obo at cuni.cz)
http://www.cuni.cz/~obo
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list