[m-rev.] diff: (2nd try) string splitting routines
Ondrej Bojar
obo at cuni.cz
Fri Feb 2 14:32:34 AEDT 2007
(Extra comment for Ralph: split_at_string is useful for splitting at
e.g. "<tab>".)
Estimated hours taken: 3
A few handy functions for splitting a string added.
library/string.m:
Added remove_suffix_if_present, split_at_separator, split_at_char,
split_at_string
tests/hard_coded/string_split.m:
A simple test of split_at_* functions.
tests/hard_coded/string_split.exp:
Expected results of the tests of split_at_* functions.
tests/hard_coded/string_various.m:
Created. Added testcase for remove_suffix_if_present.
tests/hard_coded/string_various.exp:
Created. Added results for remove_suffix_if_present.
Index: library/string.m
===================================================================
RCS file: /home/mercury/mercury1/repository/mercury/library/string.m,v
retrieving revision 1.254
diff -u -r1.254 string.m
--- library/string.m 18 Jan 2007 07:33:03 -0000 1.254
+++ library/string.m 2 Feb 2007 03:26:14 -0000
@@ -67,6 +67,11 @@
%
:- pred string.remove_suffix(string::in, string::in, string::out) is
semidet.
+ % string.remove_suffix_if_present(Suffix, String) returns `String' minus
+ % `Suffix' if `String' ends with `Suffix', and `String' unchanged otherwise.
+ %
+:- func string.remove_suffix_if_present(string, string) = string.
+
% string.prefix(String, Prefix) is true iff Prefix is a prefix of
String.
% Same as string.append(Prefix, _, String).
%
@@ -555,6 +560,8 @@
% string.words_separator(char.is_whitespace, " the cat sat on the
mat") =
% ["the", "cat", "sat", "on", "the", "mat"]
%
+ % Note the difference to string.split_at_separator
+ %
:- func string.words_separator(pred(char), string) = list(string).
:- mode string.words_separator(pred(in) is semidet, in) = out is det.
@@ -563,6 +570,34 @@
%
:- func string.words(string) = list(string).
+ % string.split_at_separator(SepP, String) returns the list of
+ % substrings of String (in first to last order) that are delimited
+ % by chars matched by SepP. For example,
+ %
+ % string.split_at_separator(char.is_whitespace, " a cat  sat on the  mat")
+ % = ["", "a", "cat", "", "sat", "on", "the", "", "mat"]
+ %
+ % Note the difference from string.words_separator: empty substrings
+ % (e.g. before a leading separator) are kept rather than dropped.
+ %
+:- func string.split_at_separator(pred(char), string) = list(string).
+:- mode string.split_at_separator(pred(in) is semidet, in) = out is det.
+
+ % string.split_at_char(Char, String) =
+ % string.split_at_separator(unify(Char), String)
+ %
+:- func string.split_at_char(char, string) = list(string).
+
+ % string.split_at_string(Separator, String) returns the list of
substrings
+ % of String that are delimited by Separator. For example,
+ %
+ % string.split_at_string("|||", "|||fld2|||fld3")
+ % = ["", "fld2", "fld3"]
+ %
+ % The String is always broken at the leftmost match of Separator, for
+ % example: string.split_at_string("aa", "xaaayaaaz") = ["x", "ay", "az"]
+ %
+:- func string.split_at_string(string, string) = list(string).
+
% string.split(String, Count, LeftSubstring, RightSubstring):
% `LeftSubstring' is the left-most `Count' characters of `String',
% and `RightSubstring' is the remainder of `String'.
@@ -969,6 +1004,17 @@
string.to_char_list(C, LC),
char_list_remove_suffix(LA, LB, LC).
+    % Cut String so that its right-hand piece is as long as Suffix; the
+    % suffix is present exactly when that piece equals Suffix.
+    % NOTE(review): when String is shorter than Suffix the split count is
+    % negative; string.split is relied on to cope with that -- confirm
+    % against its documentation.
+    %
+string.remove_suffix_if_present(Suffix, String) = Out :-
+    SuffixLen = string.length(Suffix),
+    string.length(String, StringLen),
+    string.split(String, StringLen - SuffixLen, Front, Back),
+    Out = ( if Back = Suffix then Front else String ).
+
+
:- pragma promise_equivalent_clauses(string.prefix/2).
string.prefix(String::in, Prefix::in) :-
@@ -4108,6 +4154,56 @@
%------------------------------------------------------------------------------%
+    % The scan runs backwards over InStr, so it must start at the last
+    % valid index, length - 1 (not at length, which is one past the end
+    % and would make the helper index out of range and over-count the
+    % length of the last segment). For the empty string this is -1, and
+    % the helper then returns [""].
+    %
+string.split_at_separator(DelimPred, InStr) = OutStrs :-
+    LastIndex = string.length(InStr) - 1,
+    split_at_separator2(DelimPred, InStr, LastIndex, LastIndex, [], OutStrs).
+
+    % split_at_separator2(DelimPred, Str, I, ThisSegEnd, ITail, OTail):
+    %
+    % Walks Str backwards from index I, prepending to ITail each chunk
+    % delimited by chars matching DelimPred, and returns the result in
+    % OTail. ThisSegEnd is the index of the last char (inclusive) of the
+    % segment currently being scanned; it is I - 1 of the most recent
+    % separator, so it is less than I's successor only while the segment
+    % is non-empty.
+    %
+:- pred split_at_separator2(pred(char)::in(pred(in) is semidet), string::in,
+    int::in, int::in, list(string)::in, list(string)::out) is det.
+
+split_at_separator2(DelimPred, Str, I, ThisSegEnd, ITail, OTail) :-
+    ( I < 0 ->
+        % Reached the beginning: the first segment is Str[0 .. ThisSegEnd].
+        ( ThisSegEnd < 0 ->
+            OTail = ["" | ITail]
+        ;
+            ThisSeg = string.unsafe_substring(Str, 0, ThisSegEnd + 1),
+            OTail = [ThisSeg | ITail]
+        )
+    ;
+        C = string.unsafe_index(Str, I),
+        ( DelimPred(C) ->
+            % Chop here: the finished segment is Str[I + 1 .. ThisSegEnd],
+            % which has ThisSegEnd - I chars.
+            ThisSeg = string.unsafe_substring(Str, I + 1, ThisSegEnd - I),
+            TTail = [ThisSeg | ITail],
+            split_at_separator2(DelimPred, Str, I - 1, I - 1, TTail, OTail)
+        ;
+            % Not a separator: extend the current segment leftwards.
+            split_at_separator2(DelimPred, Str, I - 1, ThisSegEnd, ITail,
+                OTail)
+        )
+    ).
+
+%------------------------------------------------------------------------------%
+
+    % Splitting at a single char is splitting at the separator predicate
+    % that accepts exactly that char.
+    %
+string.split_at_char(Char, String) = Substrings :-
+    IsSeparator = (pred(C::in) is semidet :- C = Char),
+    Substrings = string.split_at_separator(IsSeparator, String).
+
+%------------------------------------------------------------------------------%
+
+    % An empty Needle cannot delimit anything. It is handled here because
+    % the loop below advances by the length of Needle after each match, so
+    % a zero-length match at StartAt would recurse forever without making
+    % progress. In that case Total is returned unsplit.
+    %
+split_at_string(Needle, Total) = Out :-
+    ( Needle = "" ->
+        Out = [Total]
+    ;
+        Out = split_at_string(0, length(Needle), Needle, Total)
+    ).
+
+    % split_at_string(StartAt, NeedleLen, Needle, Total):
+    %
+    % Returns, in first to last order, the pieces of Total from offset
+    % StartAt onwards that are delimited by Needle. NeedleLen must be the
+    % length of Needle and must be greater than zero.
+    %
+:- func split_at_string(int, int, string, string) = list(string).
+
+split_at_string(StartAt, NeedleLen, Needle, Total) = Out :-
+    ( sub_string_search_start(Total, Needle, StartAt, NeedlePos) ->
+        % The piece before this match, then carry on after the match.
+        BeforeNeedle = substring(Total, StartAt, NeedlePos - StartAt),
+        Tail = split_at_string(NeedlePos + NeedleLen, NeedleLen, Needle,
+            Total),
+        Out = [BeforeNeedle | Tail]
+    ;
+        % No further match: everything from StartAt on is the last piece.
+        string.split(Total, StartAt, _Skipped, Last),
+        Out = [Last]
+    ).
+
+%------------------------------------------------------------------------------%
+
% preceding_boundary(SepP, String, I) returns the largest index J =< I
% in String of the char that is SepP and min(-1, I) if there is no
such J.
% preceding_boundary/3 is intended for finding (in reverse)
consecutive
Index: tests/hard_coded/string_split.exp
===================================================================
RCS file: tests/hard_coded/string_split.exp
diff -N tests/hard_coded/string_split.exp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_split.exp 2 Feb 2007 03:00:02 -0000
@@ -0,0 +1,6 @@
+hello:world:how:are:you!
+hello<tab>world<tab>how<tab>are<tab><tab>you!
+user<tab>group<tab>id1<tab>id2
+x<tab>ay<tab>az
+x<tab>a <tab>aax <tab> x
+col1<tab>col2:val2<tab>col3<tab>
Index: tests/hard_coded/string_split.m
===================================================================
RCS file: tests/hard_coded/string_split.m
diff -N tests/hard_coded/string_split.m
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_split.m 2 Feb 2007 02:36:14 -0000
@@ -0,0 +1,36 @@
+:- module string_split.
+:- interface.
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+:- implementation.
+
+:- import_module string, char.
+
+main(!IO) :-
+ % Each case writes the pieces of one split joined by a visible marker
+ % and followed by a newline; the six output lines are compared with
+ % string_split.exp.
+ %
+ % Case 1: split at upper-case chars, pieces joined with ":".
+ io__write_list(
+ split_at_separator(char__is_upper, "helloXworldXhowXareYyou!"),
+ ":", io__write_string, !IO),
+ io__nl(!IO),
+ % Case 2: split at whitespace; the two adjacent tabs give an empty
+ % piece, which shows up as two consecutive markers in the output.
+ io__write_list(
+ split_at_separator(char__is_whitespace, "hello world\thow
are\t\tyou!"),
+ "<tab>", io__write_string, !IO),
+ io__nl(!IO),
+ % Case 3: split at a single char.
+ io__write_list(
+ split_at_char(':', "user:group:id1:id2"),
+ "<tab>", io__write_string, !IO),
+ io__nl(!IO),
+ % Case 4: overlapping candidate matches; the expected pieces are
+ % "x", "ay" and "az".
+ io__write_list(
+ split_at_string("aa", "xaaayaaaz"),
+ "<tab>", io__write_string, !IO),
+ io__nl(!IO),
+ % Case 5: runs of 'a' longer than the separator; the surplus chars
+ % stay in the neighbouring pieces.
+ io__write_list(
+ split_at_string("aaa", "xaaaa aaaaax aaa x"),
+ "<tab>", io__write_string, !IO),
+ io__nl(!IO),
+ % Case 6: a trailing separator gives a final empty piece, so the
+ % expected line ends with the marker.
+ io__write_list(
+ split_at_string(":::", "col1:::col2:val2:::col3:::"),
+ "<tab>", io__write_string, !IO),
+ io__nl(!IO),
+ true.
Index: tests/hard_coded/string_various.exp
===================================================================
RCS file: tests/hard_coded/string_various.exp
diff -N tests/hard_coded/string_various.exp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_various.exp 2 Feb 2007 02:59:24 -0000
@@ -0,0 +1,3 @@
+myfile
+myfile
+myfile.gz
Index: tests/hard_coded/string_various.m
===================================================================
RCS file: tests/hard_coded/string_various.m
diff -N tests/hard_coded/string_various.m
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tests/hard_coded/string_various.m 2 Feb 2007 02:58:29 -0000
@@ -0,0 +1,18 @@
+:- module string_various.
+:- interface.
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+:- implementation.
+
+:- import_module string, char.
+
+main(!IO) :-
+ % Each result is written on its own line and compared with
+ % string_various.exp.
+ %
+ % Suffix absent: the string is expected back unchanged ("myfile").
+ io__write_string(remove_suffix_if_present(".gz", "myfile"), !IO),
+ io__nl(!IO),
+ % Suffix present once: it is removed ("myfile").
+ io__write_string(remove_suffix_if_present(".gz", "myfile.gz"), !IO),
+ io__nl(!IO),
+ % Suffix present twice: only the last occurrence is removed
+ % ("myfile.gz").
+ io__write_string(remove_suffix_if_present(".gz", "myfile.gz.gz"), !IO),
+ io__nl(!IO),
+ true.
--
Ondrej Bojar (mailto:obo at cuni.cz)
http://www.cuni.cz/~obo
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to: mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions: mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the reviews
mailing list