[m-dev.] Proposal: parsing module for the library.

Ralph Becket rafe at csse.unimelb.edu.au
Fri Jan 16 18:16:34 AEDT 2009


Zoltan Somogyi, Friday, 16 January 2009:
> 
> Same here. The compiler has a couple of modules that use this, inst_match and
> equiv_type_hlds, and I hate it too.

I agree.  Here's a relative diff (test case to follow):


--- parsing_utils.m.bak	2009-01-15 13:10:04.935193508 +1100
+++ parsing_utils.m	2009-01-15 14:15:35.413750676 +1100
@@ -1,11 +1,31 @@
-%-----------------------------------------------------------------------------%
-% parsing_utils.m
-% Ralph Becket <rafe at csse.unimelb.edu.au>
-% Tue Jan 13 11:32:49 EST 2009
+%---------------------------------------------------------------------------%
 % vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
+%---------------------------------------------------------------------------%
+% Copyright (C) 2009 The University of Melbourne.
+% This file may only be copied under the terms of the GNU Library General
+% Public License - see the file COPYING.LIB in the Mercury distribution.
+%---------------------------------------------------------------------------%
+% 
+% File: parsing_utils.m
+% Author: Ralph Becket <rafe at csse.unimelb.edu.au>
+% Stability: low
+%
+% Utilities for recursive descent parsers.  Parsers take at least three
+% arguments: a source (src) containing the input string and a parser
+% state (ps) input/output pair tracking the current offset into the input.
 %
-% Utilities for recursive descent parsers.
+% A new src and ps can be constructed by calling
+% new_src_and_ps(InputString, Src, !:PS).  Parsing predicates are semidet
+% and typically take the form p(Src, ...input arguments..., Result, !PS).
+% A parser matching variable assignments of the form `x = 42' might be
+% defined like this:
 %
+%   var_assignment(Src, {Var, Value}, !PS) :-
+%       var(Src, Var, !PS),
+%       punct(Src, "=", _, !PS),
+%       expr(Src, Value, !PS).
+%
+%-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
 
 :- module parsing_utils.
@@ -22,14 +42,19 @@
 
 
 
+    % The parser source (input string).
+    %
+:- type src.
+
     % The parser "state", passed around in DCG arguments.
     % 
 :- type ps.
 
-    % The parser source (input string).
+    % This type and inst are useful for specifying "standard" parser
+    % signatures.  A parser's first argument should be the src: this
+    % arrangement makes parsers easier to combine via higher order
+    % combinators.
     %
-:- type src.
-
 :- type parser(T) == pred(T, ps, ps).
 :- inst parser == ( pred(out, in, out) is semidet ).
 
@@ -37,112 +62,124 @@
     %
 :- pred new_src_and_ps(string::in, src::out, ps::out) is det.
 
-    % Obtain the current offset from the start of the input string.
+    % Obtain the current offset from the start of the input string
+    % (the first character in the input has offset 0).
     %
-:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
+:- pred current_offset(src::in, int::out,
+        ps::in, ps::out) is det.
+
+    % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
+    % Copy the substring from the input occupying the offsets
+    % [StartOffset, EndOffsetPlusOne).
+    %
+:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
 
     % Read the next char.
     %
-:- pred next_char(src::in)
-        : parser(char) `with_inst` parser.
+:- pred next_char(src::in, char::out,
+        ps::in, ps::out) is semidet.
 
     % Match a char from the given string.
     %
-:- pred char_in_class(src::in, string::in)
-        : parser(char) `with_inst` parser.
+:- pred char_in_class(src::in, string::in, char::out,
+        ps::in, ps::out) is semidet.
 
     % Match a string exactly and any subsequent whitespace.
     %
-:- pred punct(src::in, string::in)
-        : parser(unit) `with_inst` parser.
+:- pred punct(src::in, string::in, unit::out,
+        ps::in, ps::out) is semidet.
 
-    % keyword(Src, IdChars, Keyword, _) matches Keyword exactly (i.e., it must
-    % not be followed by any character in IdChars) and any subsequent
+    % keyword(Src, IdChars, Keyword, _, !PS) matches Keyword exactly (i.e., it
+    % must not be followed by any character in IdChars) and any subsequent
     % whitespace.
     %
-:- pred keyword(src::in, string::in, string::in)
-        : parser(unit) `with_inst` parser.
+:- pred keyword(src::in, string::in, string::in, unit::out,
+        ps::in, ps::out) is semidet.
 
-    % identifier(Src, InitIdChars, IdChars, Identifier) matches the next
+    % identifier(Src, InitIdChars, IdChars, Identifier, !PS) matches the next
     % identifer (result in Identifier) comprising a char from InitIdChars
     % followed by zero or more chars from IdChars.
     %
-:- pred identifier(src::in, string::in, string::in)
-        : parser(string) `with_inst` parser.
+:- pred identifier(src::in, string::in, string::in, string::out,
+        ps::in, ps::out) is semidet.
 
     % Consume any whitespace.
     %
-:- pred whitespace(src::in)
-        : parser(unit) `with_inst` parser.
+:- pred whitespace(src::in, unit::out,
+        ps::in, ps::out) is semidet.
 
     % Consume any input up to, and including, the next newline character
     % marking the end of the current line.
     %
-:- pred skip_to_eol(src::in)
-        : parser(unit) `with_inst` parser.
+:- pred skip_to_eol(src::in, unit::out,
+        ps::in, ps::out) is semidet.
 
     % Succeed if we have reached the end of the input.
     %
-:- pred eof(src::in)
-        : parser(unit) `with_inst` parser.
+:- pred eof(src::in, unit::out,
+        ps::in, ps::out) is semidet.
 
     % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-][0-9]+)?
+    % followed by any whitespace.
     %
-:- pred float_literal(src::in)
-        : parser(float) `with_inst` parser.
+:- pred float_literal(src::in, float::out,
+        ps::in, ps::out) is semidet.
 
-    % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+.
+    % Parse an int literal matching [-][0-9]+ that is not followed by
+    % [.][0-9]+, then consume any following whitespace.
     %
-:- pred int_literal(src::in)
-        : parser(int) `with_inst` parser.
+:- pred int_literal(src::in, int::out,
+        ps::in, ps::out) is semidet.
     
-        % Parse an string literal.  The string argument is the quote character.
-        % A backslash (\) character in the string makes the next character
-        % literal (e.g., for embedding quotes).  These 'escaped' characters
-        % are included as-is in the result, along with the preceding backslash.
-        %
-:- pred string_literal(src::in, char::in)
-        : parser(string) `with_inst` parser.
-
-        % optional(Src, P) returns yes(X), if P succeeds returning X, or no
-        % if P does not succeed.
-        %
-:- pred optional(src::in, parser(T)::in(parser))
-        : parser(maybe(T)) `with_inst` parser.
-
-        % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
-        % by repeatedly applying P until P fails.  The nth item in Xs is
-        % the result from the nth application of P.
-        %
-:- pred zero_or_more(src::in, parser(T)::in(parser))
-        : parser(list(T)) `with_inst` parser.
-
-        % one_or_more(Src, P, Xs) returns the list of results Xs obtained
-        % by repeatedly applying P until P fails.  The nth item in Xs is
-        % the result from the nth application of P.  P must succeed at
-        % least once.
-        %
-:- pred one_or_more(src::in, parser(T)::in(parser))
-        : parser(list(T)) `with_inst` parser.
-
-        % brackets(Src, L, R, P, X) is equivalent to
-        %   punct(Src, L, _), P(Src, X), punct(Src, R, _).
-        %
-:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser))
-        : parser(T) `with_inst` parser.
-
-        % separated_list(Src, Separator, P, Xs) is like
-        % zero_or_more(Src, P, Xs) except that successive applications of
-        % P must be separated by punct(Src, Separator, _).
-        %
-:- pred separated_list(src::in, string::in, parser(T)::in(parser))
-        : parser(list(T)) `with_inst` parser.
-
-        % comma_separated_list(Src, P, Xs) is the same as
-        %   separated_list(Src, ",", P, Xs).
-        %
-:- pred comma_separated_list(src::in, parser(T)::in(parser))
-        : parser(list(T)) `with_inst` parser.
+    % Parse a string literal.  The char argument is the quote character.
+    % A backslash (\) character in the string makes the next character
+    % literal (e.g., for embedding quotes).  These 'escaped' characters
+    % are included as-is in the result, along with the preceding backslash.
+    % Any following whitespace is also consumed.
+    %
+:- pred string_literal(src::in, char::in, string::out,
+        ps::in, ps::out) is semidet.
+
+    % optional(Src, P) returns yes(X), if P succeeds returning X, or no
+    % if P does not succeed.
+    %
+:- pred optional(src::in, parser(T)::in(parser), maybe(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.
+    %
+:- pred zero_or_more(src::in, parser(T)::in(parser), list(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % one_or_more(Src, P, Xs) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.  P must succeed at
+    % least once.
+    %
+:- pred one_or_more(src::in, parser(T)::in(parser), list(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % brackets(Src, L, R, P, X, !PS) is equivalent to
+    %   punct(Src, L, _, !PS), P(X, !PS), punct(Src, R, _, !PS).
+    %
+:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser), T::out,
+        ps::in, ps::out) is semidet.
+
+    % separated_list(Src, Separator, P, Xs) is like
+    % zero_or_more(Src, P, Xs) except that successive applications of
+    % P must be separated by punct(Src, Separator, _).
+    %
+:- pred separated_list(src::in, string::in, parser(T)::in(parser),
+        list(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % comma_separated_list(Src, P, Xs) is the same as
+    %   separated_list(Src, ",", P, Xs).
+    %
+:- pred comma_separated_list(src::in, parser(T)::in(parser), list(T)::out,
+        ps::in, ps::out) is semidet.
 
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
@@ -198,8 +235,6 @@
 
 %-----------------------------------------------------------------------------%
 
-:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
-
 input_substring(Src, Start, EndPlusOne, Substring) :-
     EndPlusOne =< Src ^ input_length,
     Substring =
@@ -207,7 +242,8 @@
 
 %-----------------------------------------------------------------------------%
 
-:- pred match_string(src::in, string::in, ps::in, ps::out) is semidet.
+:- pred match_string(src::in, string::in,
+        ps::in, ps::out) is semidet.
 
 match_string(Src, MatchStr, PS, PS + N) :-
     N = string.length(MatchStr),
@@ -347,8 +383,8 @@
 
 %-----------------------------------------------------------------------------%
 
-:- pred digits(src::in, int::in)
-        : parser(unit) `with_inst` parser.
+:- pred digits(src::in, int::in, unit::out,
+        ps::in, ps::out) is semidet.
 
 digits(Src, Base, unit, !PS) :-
     next_char(Src, C, !PS),
@@ -357,8 +393,8 @@
     digits_2(Src, Base, _, !PS).
 
 
-:- pred digits_2(src::in, int::in)
-        : parser(unit) `with_inst` parser.
+:- pred digits_2(src::in, int::in, unit::out,
+        ps::in, ps::out) is semidet.
 
 digits_2(Src, Base, unit, !PS) :-
     ( if
@@ -383,8 +419,8 @@
 
 %-----------------------------------------------------------------------------%
 
-:- pred string_literal_2(src::in, char::in)
-        : parser(unit) `with_inst` parser.
+:- pred string_literal_2(src::in, char::in, unit::out,
+        ps::in, ps::out) is semidet.
 
 string_literal_2(Src, QuoteChar, unit, !PS) :-
     next_char(Src, C, !PS),
@@ -408,8 +444,8 @@
 
 %-----------------------------------------------------------------------------%
 
-:- pred identifier_2(src::in, string::in)
-        : parser(unit) `with_inst` parser.
+:- pred identifier_2(src::in, string::in, unit::out,
+        ps::in, ps::out) is semidet.
 
 identifier_2(Src, IdChars, unit, !PS) :-
     ( if char_in_class(Src, IdChars, _, !PS) then
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to:       mercury-developers at csse.unimelb.edu.au
Administrative Queries: owner-mercury-developers at csse.unimelb.edu.au
Subscriptions:          mercury-developers-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the developers mailing list