[m-rev.] For review: updates to parsing_utils.m

Ralph Becket rafe at csse.unimelb.edu.au
Tue Jun 16 17:15:59 AEST 2009

Estimated hours taken: 2
Branches: main

	Add support for parsers that update their own state (e.g., a symbol
	table).  Add support for user-defined whitespace specification (e.g.,
	to include comments).  Add support for converting offsets into line
	numbers (e.g., for error reporting).
	Update the test case.

Index: library/parsing_utils.m
RCS file: /home/mercury1/repository/mercury/library/parsing_utils.m,v
retrieving revision 1.1
diff -u -r1.1 parsing_utils.m
--- library/parsing_utils.m	28 Jan 2009 07:19:41 -0000	1.1
+++ library/parsing_utils.m	16 Jun 2009 04:30:17 -0000
@@ -15,10 +15,13 @@
 % state (ps) input/output pair tracking the current offset into the input.
 % A new src and ps can be constructed by calling
-% new_src_and_ps(InputString, Src, !:PS).  Parsing predicates are semidet
-% and typically take the form p(...input arguments..., Src, Result, !PS).
-% A parser matching variable assignments of the form `x = 42' might be
-% defined like this:
+% new_src_and_ps(InputString, SkipWS, Src, !:PS) where the SkipWS function
+% is used by the primitive parsers to skip over any following whitespace
+% (providing a skipping function allows users to define comments as
+% whitespace).
+% Parsing predicates are semidet and typically take the form
+% p(...parameters..., Src, Result, !PS).  A parser matching variable
+% assignments of the form `x = 42' might be defined like this:
 %   var_assignment(Src, {Var, Value}, !PS) :-
 %       var(Src, Var, !PS),
@@ -54,13 +57,29 @@
 :- type ps.
-    % This type and inst are useful for specifying "standard" parser
+    % These types and insts are useful for specifying "standard" parser
     % signatures.
 :- type parser(T) == pred(src, T, ps, ps).
 :- inst parser == ( pred(in, out, in, out) is semidet ).
-    % Construct a new parser source and state from a string.
+    % The following are for parsers that also transform a separate state value.
+    %
+:- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
+:- inst parser_with_state == ( pred(in, out, in, out, in, out) is semidet ).
+    % Construct a new parser source and state from a string, also specifying
+    % a function for skipping over whitespace (several primitive parsers
+    % use this function to consume whitespace after a token; this argument
+    % allows the user to specify a function for, say, skipping over comments
+    % as well).
+    %
+:- pred new_src_and_ps(string::in,
+        (func(src, ps) = ps)::in(func(in, in) = out is det),
+        src::out, ps::out) is det.
+    % Construct a new parser source and state from a string (the default
+    % whitespace parser is used).
 :- pred new_src_and_ps(string::in, src::out, ps::out) is det.
@@ -70,6 +89,21 @@
 :- pred current_offset(src::in, int::out,
         ps::in, ps::out) is det.
+    % Compute a structure from the parser source which can be used to
+    % convert offsets into line numbers and positions in the file (this
+    % is useful for error reporting).
+    %
+:- type line_numbers.
+:- func src_to_line_numbers(src) = line_numbers.
+    % Convert an offset into a line number and position within the line
+    % (the first line is number 1; the first character in a line is
+    % position 1).
+    %
+:- pred offset_to_line_number_and_position(line_numbers::in, int::in,
+        int::out, int::out) is det.
     % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
     % Copy the substring from the input occupying the offsets
     % [StartOffset, EndOffsetPlusOne).
@@ -91,7 +125,7 @@
 :- pred punct(string::in, src::in, unit::out,
         ps::in, ps::out) is semidet.
-    % keyword(Src, IdChars, Keyword, _, !PS) matches Keyword exactly (i.e., it
+    % keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
     % must not be followed by any character in IdChars) and any subsequent
     % whitespace.
@@ -194,11 +228,59 @@
 :- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
         ps::in, ps::out) is semidet.
+% The following parser combinators are equivalent to the above, except that
+% a separate state argument is threaded through the computation (e.g., for
+% parsers that incrementally construct a symbol table).
+    % optional(P, Src, Result, !S, !PS) returns Result = yes(X)
+    % if P(Src, X, !S, !PS) succeeds, or Result = no if P does not succeed.
+    %
+:- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
+        maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+    % zero_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.
+    %
+:- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
+        list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+    % one_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.  P must succeed at
+    % least once.
+    %
+:- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
+        list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+    % brackets(L, R, P, Src, X, !S, !PS) is equivalent to
+    %   punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
+    %
+:- pred brackets(string::in, string::in,
+        parser_with_state(T, S)::in(parser_with_state), src::in,
+        T::out, S::in, S::out, ps::in, ps::out) is semidet.
+    % separated_list(Separator, P, Src, Xs, !S, !PS) is like
+    % zero_or_more(P, Src, Xs, !S, !PS) except that successive applications of
+    % P must be separated by punct(Separator, Src, _, !PS).  Note that, as
+    % implemented, P must succeed at least once, so Xs is never empty.
+    %
+:- pred separated_list(string::in,
+        parser_with_state(T, S)::in(parser_with_state),
+        src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+    % comma_separated_list(P, Src, Xs, !S, !PS) is the same as
+    %   separated_list(",", P, Src, Xs, !S, !PS).
+    %
+:- pred comma_separated_list(parser_with_state(T, S)::in(parser_with_state),
+        src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
 :- implementation.
+:- import_module array.
     % The parser "state" is just the offset into the input string.
@@ -208,21 +290,90 @@
 :- type src
     --->    src(
                 input_length    ::  int,
-                input_string    ::  string
+                input_string    ::  string,
+                skip_ws_func    ::  func(src, ps) = ps
 new_src_and_ps(InputString, Src, PS) :-
-    Src = src(string.length(InputString), InputString),
+    new_src_and_ps(InputString, skip_whitespace, Src, PS).
+new_src_and_ps(InputString, SkipWS, Src, PS) :-
+    Src = src(string.length(InputString), InputString, SkipWS),
     PS = 0.
+    % The default whitespace skipper: consume whatever whitespace/4 matches
+    % starting at PS0, or stay at PS0 if whitespace/4 fails.
+    %
+:- func skip_whitespace(src, ps) = ps.
+skip_whitespace(Src, PS0) = PS :-
+    ( if whitespace(Src, _, PS0, PS1) then
+        PS = PS1
+      else
+        PS = PS0
+    ).
+    % Skip whitespace by applying the skipping function stored in Src.
+    %
+:- pred skip_whitespace(src::in, ps::in, ps::out) is det.
+skip_whitespace(Src, !PS) :-
+    Skipper = Src ^ skip_ws_func,
+    !:PS = Skipper(Src, !.PS).
 % Low-level predicates.
+    % The offsets of the newline characters in the input string, in
+    % ascending order.
+    %
+:- type line_numbers == array(int).
+src_to_line_numbers(Src) = LineNos :-
+    Input = Src ^ input_string,
+    LastOffset = Src ^ input_length - 1,
+    CollectNewline = ( func(Offset, Offsets0) = Offsets :-
+        ( if string.unsafe_index(Input, Offset) = ('\n') then
+            Offsets = [Offset | Offsets0]
+          else
+            Offsets = Offsets0
+        )
+    ),
+    % Folding downwards while consing leaves the list in ascending order.
+    NewlineOffsets = int.fold_down(CollectNewline, 0, LastOffset, []),
+    LineNos = array(NewlineOffsets).
+    % LineNos holds the offsets of the newline characters in the input, in
+    % ascending order (see src_to_line_numbers).  The number of newlines
+    % strictly before Offset gives the zero-based index of the line that
+    % contains Offset; a newline character itself is counted as the last
+    % character of the line it terminates.
+    %
+    % The search range is [0, size] rather than [0, size - 1] so that
+    % offsets after the last newline (and inputs with no newline at all)
+    % are handled without indexing outside the array.
+    %
+offset_to_line_number_and_position(LineNos, Offset, LineNo, Pos) :-
+    Lo = 0,
+    Hi = array.size(LineNos),
+    offset_to_line_number_and_position_2(LineNos, Lo, Hi, Offset, LineNo, Pos).
+:- pred offset_to_line_number_and_position_2(line_numbers::in, int::in,
+        int::in, int::in, int::out, int::out) is det.
+    % Perform a binary search for the smallest index I in [Lo, Hi] such that
+    % I = Hi or the newline offset at index I is >= Offset.  That index is
+    % the number of newlines preceding Offset.
+    %
+offset_to_line_number_and_position_2(LineNos, Lo, Hi, Offset, LineNo, Pos) :-
+    ( if Lo < Hi then
+        % Lo =< Mid < Hi =< size, so Mid is always a valid array index.
+        Mid = (Lo + Hi) / 2,
+        MidOffset = LineNos ^ elem(Mid),
+        ( if MidOffset < Offset then
+            offset_to_line_number_and_position_2(LineNos, Mid + 1, Hi, Offset,
+                LineNo, Pos)
+          else
+            offset_to_line_number_and_position_2(LineNos, Lo, Mid, Offset,
+                LineNo, Pos)
+        )
+      else
+        LineNo = 1 + Lo,
+        % The line starts just after the preceding newline, or at offset 0
+        % for the first line.
+        ( if Lo > 0 then
+            LineStart = LineNos ^ elem(Lo - 1) + 1
+          else
+            LineStart = 0
+        ),
+        Pos = 1 + Offset - LineStart
+    ).
 current_offset(_Src, Offset, !PS) :-
     Offset = !.PS.
@@ -291,11 +442,50 @@
+    % If P succeeds, wrap its result in yes/1 and adopt its updated states;
+    % otherwise return no and leave both states as they were.
+    %
+optional(P, Src, Result, S0, S, PS0, PS) :-
+    ( if P(Src, X, S0, S1, PS0, PS1) then
+        Result = yes(X),
+        S = S1,
+        PS = PS1
+      else
+        Result = no,
+        S = S0,
+        PS = PS0,
+        semidet_true
+    ).
 zero_or_more(P, Src, Result, !PS) :-
-    ( if P(Src, X, !PS), zero_or_more(P, Src, Xs, !PS) then
-        Result = [X | Xs]
+    zero_or_more_2(P, Src, [], RevResult, !PS),
+    Result = list.reverse(RevResult).
+    % Auxiliary predicate that accumulates the results in reverse order so
+    % that the recursive call is the last goal, making the loop tail
+    % recursive.  (Non-tail recursion can be an issue with long sequences.)
+    %
+:- pred zero_or_more_2(parser(T)::in(parser), src::in,
+        list(T)::in, list(T)::out, ps::in, ps::out) is semidet.
+zero_or_more_2(P, Src, RevAcc0, RevAcc, PS0, PS) :-
+    ( if P(Src, X, PS0, PS1) then
+        zero_or_more_2(P, Src, [X | RevAcc0], RevAcc, PS1, PS)
+      else
+        RevAcc = RevAcc0,
+        PS = PS0,
+        semidet_true
+    ).
+    % Collect the results in reverse with the accumulator-based helper,
+    % then restore the order of application.
+    %
+zero_or_more(P, Src, Xs, !S, !PS) :-
+    zero_or_more_2(P, Src, [], RevXs, !S, !PS),
+    list.reverse(RevXs, Xs).
+:- pred zero_or_more_2(parser_with_state(T, S)::in(parser_with_state), src::in,
+        list(T)::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+zero_or_more_2(P, Src, !RevResult, !S, !PS) :-
+    ( if P(Src, X, !S, !PS) then
+        list.cons(X, !RevResult),
+        zero_or_more_2(P, Src, !RevResult, !S, !PS)
-        Result = [],
@@ -308,6 +498,13 @@
+    % P must succeed once; zero_or_more then gathers any further results.
+    %
+one_or_more(P, Src, [First | Rest], !S, !PS) :-
+    P(Src, First, !S, !PS),
+    zero_or_more(P, Src, Rest, !S, !PS).
 brackets(L, R, P, Src, Result, !PS) :-
     punct(L, Src, _, !PS),
     P(Src, Result, !PS),
@@ -315,6 +512,13 @@
+    % Match the opening punctuation L, run P (the only step that touches
+    % the user state), then match the closing punctuation R.
+    %
+brackets(L, R, P, Src, Result, !S, PS0, PS) :-
+    punct(L, Src, _, PS0, PS1),
+    P(Src, Result, !S, PS1, PS2),
+    punct(R, Src, _, PS2, PS).
 separated_list(Separator, P, Src, Result, !PS) :-
     CommaP = ( pred(CommaPSrc::in, CommaPX::out, !.PS::in, !:PS::out)
             is semidet :-
@@ -327,19 +531,37 @@
+    % Parse one item with P, then zero or more repetitions of
+    % "Separator followed by an item".  P must succeed at least once, so
+    % the result is never the empty list.
+    %
+separated_list(Separator, P, Src, [First | Rest], !S, !PS) :-
+    P(Src, First, !S, !PS),
+    SepThenItem = ( pred(ItemSrc::in, Item::out, S0::in, S1::out,
+            PS0::in, PS2::out) is semidet :-
+        punct(Separator, ItemSrc, _, PS0, PS1),
+        P(ItemSrc, Item, S0, S1, PS1, PS2)
+    ),
+    zero_or_more(SepThenItem, Src, Rest, !S, !PS).
 comma_separated_list(P, Src, Result, !PS) :-
     separated_list(",", P, Src, Result, !PS).
+    % separated_list specialised to "," as the separator.
+    %
+comma_separated_list(P, Src, Xs, S0, S, PS0, PS) :-
+    separated_list(",", P, Src, Xs, S0, S, PS0, PS).
 whitespace(Src, unit, !PS) :-
     ( if
         next_char(Src, C, !PS),
-        whitespace(Src, _, !PS)
+        skip_whitespace(Src, !PS)
-        true
+        semidet_true
@@ -352,14 +574,14 @@
 punct(Punct, Src, unit, !PS) :-
     match_string(Punct, Src, !PS),
-    whitespace(Src, _, !PS).
+    skip_whitespace(Src, !PS).
 keyword(IdChars, Keyword, Src, unit, !PS) :-
     match_string(Keyword, Src, !PS),
     not char_in_class(IdChars, Src, _, !.PS, _),
-    whitespace(Src, _, !PS).
+    skip_whitespace(Src, !PS).
@@ -376,7 +598,7 @@
     current_offset(Src, EndPlusOne, !PS),
-    whitespace(Src, _, !PS),
+    skip_whitespace(Src, !PS),
     input_substring(Src, Start, EndPlusOne, FloatStr).
@@ -396,7 +618,7 @@
         digits(10, Src, _, !.PS, _)
     current_offset(Src, EndPlusOne, !PS),
-    whitespace(Src, _, !PS),
+    skip_whitespace(Src, !PS),
     input_substring(Src, Start, EndPlusOne, IntStr).
@@ -438,7 +660,7 @@
     next_char(Src, QuoteChar, !PS),
     string_literal_2(Src, QuoteChar, _, !PS),
     current_offset(Src, EndPlusOne, !PS),
-    whitespace(Src, _, !PS),
+    skip_whitespace(Src, !PS),
     input_substring(Src, Start + 1, EndPlusOne - 1, String).
@@ -464,7 +686,7 @@
     char_in_class(InitIdChars, Src, _, !PS),
     identifier_2(IdChars, Src, _, !PS),
     current_offset(Src, EndPlusOne, !PS),
-    whitespace(Src, _, !PS),
+    skip_whitespace(Src, !PS),
     input_substring(Src, Start, EndPlusOne, Identifier).
Index: tests/general/test_parsing_utils.exp
RCS file: /home/mercury1/repository/tests/general/test_parsing_utils.exp,v
retrieving revision 1.1
diff -u -r1.1 test_parsing_utils.exp
--- tests/general/test_parsing_utils.exp	28 Jan 2009 07:19:42 -0000	1.1
+++ tests/general/test_parsing_utils.exp	16 Jun 2009 05:09:43 -0000
@@ -213,3 +213,20 @@
 pass: comma_separated_list(punct("!")) on "!, ! , !   abc"
 	returned [unit, unit, unit] as expected
 	[11 chars consumed]
+pass: optional(int_with_state) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: optional(int_with_state) on "1"
+	returned [1] as expected
+	[1 chars consumed]
+pass: zero_or_more(int_with_state) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(int_with_state) on "1 2 3"
+	returned [3, 2, 1] as expected
+	[5 chars consumed]
+pass: one_or_more(int_with_state) on "abc"
+	failed as expected
+pass: one_or_more(int_with_state) on "1 2 3"
+	returned [3, 2, 1] as expected
+	[5 chars consumed]
Index: tests/general/test_parsing_utils.m
RCS file: /home/mercury1/repository/tests/general/test_parsing_utils.m,v
retrieving revision 1.1
diff -u -r1.1 test_parsing_utils.m
--- tests/general/test_parsing_utils.m	28 Jan 2009 07:19:42 -0000	1.1
+++ tests/general/test_parsing_utils.m	16 Jun 2009 05:08:16 -0000
@@ -295,6 +295,38 @@
     "!, ! , !   abc", yes("[unit, unit, unit]")).
+    stringify_state(optional(int_with_state)),
+    "abc", yes("[]")).
+    stringify_state(optional(int_with_state)),
+    "1", yes("[1]")).
+    stringify_state(zero_or_more(int_with_state)),
+    "abc", yes("[]")).
+    stringify_state(zero_or_more(int_with_state)),
+    "1 2 3", yes("[3, 2, 1]")).
+    stringify_state(one_or_more(int_with_state)),
+    "abc", no).
+    stringify_state(one_or_more(int_with_state)),
+    "1 2 3", yes("[3, 2, 1]")).
+    % Parse an integer literal and push it onto the front of the state
+    % list, so the state ends up holding the parsed values in reverse order.
+    %
+:- pred int_with_state(src::in, int::out, list(int)::in, list(int)::out,
+        ps::in, ps::out) is semidet.
+int_with_state(Src, X, Seen0, Seen, !PS) :-
+    int_literal(Src, X, !PS),
+    Seen = [X | Seen0].
 :- pred stringify(
@@ -310,4 +342,19 @@
     String = string.string(X).
+    % Run the stateful parser P starting from an empty state list, discard
+    % its parse result, and return the final state converted to a string.
+    %
+:- pred stringify_state(
+        pred(src, T, list(S), list(S), ps, ps)::
+            in(pred(in, out, in, out, in, out) is semidet),
+        src::in, string::out, ps::in, ps::out) is semidet.
+stringify_state(P, Src, String, PS0, PS) :-
+    P(Src, _Result, [], FinalState, PS0, PS),
+    String = string.string(FinalState).
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au

More information about the reviews mailing list