[m-dev.] Proposal: parsing module for the library.
Ralph Becket
rafe at csse.unimelb.edu.au
Fri Jan 16 18:16:34 AEDT 2009
Zoltan Somogyi, Friday, 16 January 2009:
>
> Same here. The compiler has a couple of modules that use this, inst_match and
> equiv_type_hlds, and I hate it too.
I agree. Here's a relative diff (test case to follow):
--- parsing_utils.m.bak 2009-01-15 13:10:04.935193508 +1100
+++ parsing_utils.m 2009-01-15 14:15:35.413750676 +1100
@@ -1,11 +1,31 @@
-%-----------------------------------------------------------------------------%
-% parsing_utils.m
-% Ralph Becket <rafe at csse.unimelb.edu.au>
-% Tue Jan 13 11:32:49 EST 2009
+%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
+%---------------------------------------------------------------------------%
+% Copyright (C) 2009 The University of Melbourne.
+% This file may only be copied under the terms of the GNU Library General
+% Public License - see the file COPYING.LIB in the Mercury distribution.
+%---------------------------------------------------------------------------%
+%
+% File: parsing_utils.m
+% Author: Ralph Becket <rafe at csse.unimelb.edu.au>
+% Stability: low
+%
+% Utilities for recursive descent parsers. Parsers take at least three
+% arguments: a source (src) containing the input string and a parser
+% state (ps) input/output pair tracking the current offset into the input.
%
-% Utilities for recursive descent parsers.
+% A new src and ps can be constructed by calling
+% new_src_and_ps(InputString, Src, !:PS). Parsing predicates are semidet
+% and typically take the form p(Src, ...input arguments..., Result, !PS).
+% A parser matching variable assignments of the form `x = 42' might be
+% defined like this:
%
+% var_assignment(Src, {Var, Value}, !PS) :-
+% var(Src, Var, !PS),
+% punct(Src, "=", _, !PS),
+% expr(Src, Value, !PS).
+%
+%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- module parsing_utils.
@@ -22,14 +42,19 @@
+ % The parser source (input string).
+ %
+:- type src.
+
% The parser "state", passed around in DCG arguments.
%
:- type ps.
- % The parser source (input string).
+ % This type and inst are useful for specifying "standard" parser
+ % signatures. A parser's first argument should be the src: this
+ % arrangement makes parsers easier to combine via higher order
+ % combinators.
%
-:- type src.
-
:- type parser(T) == pred(T, ps, ps).
:- inst parser == ( pred(out, in, out) is semidet ).
@@ -37,112 +62,124 @@
%
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
- % Obtain the current offset from the start of the input string.
+ % Obtain the current offset from the start of the input string
+ % (the first character in the input has offset 0).
%
-:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
+:- pred current_offset(src::in, int::out,
+ ps::in, ps::out) is det.
+
+ % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
+ % Copy the substring from the input occupying the offsets
+ % [StartOffset, EndOffsetPlusOne).
+ %
+:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
% Read the next char.
%
-:- pred next_char(src::in)
- : parser(char) `with_inst` parser.
+:- pred next_char(src::in, char::out,
+ ps::in, ps::out) is semidet.
% Match a char from the given string.
%
-:- pred char_in_class(src::in, string::in)
- : parser(char) `with_inst` parser.
+:- pred char_in_class(src::in, string::in, char::out,
+ ps::in, ps::out) is semidet.
% Match a string exactly and any subsequent whitespace.
%
-:- pred punct(src::in, string::in)
- : parser(unit) `with_inst` parser.
+:- pred punct(src::in, string::in, unit::out,
+ ps::in, ps::out) is semidet.
- % keyword(Src, IdChars, Keyword, _) matches Keyword exactly (i.e., it must
- % not be followed by any character in IdChars) and any subsequent
+ % keyword(Src, IdChars, Keyword, _, !PS) matches Keyword exactly (i.e., it
+ % must not be followed by any character in IdChars) and any subsequent
% whitespace.
%
-:- pred keyword(src::in, string::in, string::in)
- : parser(unit) `with_inst` parser.
+:- pred keyword(src::in, string::in, string::in, unit::out,
+ ps::in, ps::out) is semidet.
- % identifier(Src, InitIdChars, IdChars, Identifier) matches the next
+ % identifier(Src, InitIdChars, IdChars, Identifier, !PS) matches the next
% identifier (result in Identifier) comprising a char from InitIdChars
% followed by zero or more chars from IdChars.
%
-:- pred identifier(src::in, string::in, string::in)
- : parser(string) `with_inst` parser.
+:- pred identifier(src::in, string::in, string::in, string::out,
+ ps::in, ps::out) is semidet.
% Consume any whitespace.
%
-:- pred whitespace(src::in)
- : parser(unit) `with_inst` parser.
+:- pred whitespace(src::in, unit::out,
+ ps::in, ps::out) is semidet.
% Consume any input up to, and including, the next newline character
% marking the end of the current line.
%
-:- pred skip_to_eol(src::in)
- : parser(unit) `with_inst` parser.
+:- pred skip_to_eol(src::in, unit::out,
+ ps::in, ps::out) is semidet.
% Succeed if we have reached the end of the input.
%
-:- pred eof(src::in)
- : parser(unit) `with_inst` parser.
+:- pred eof(src::in, unit::out,
+ ps::in, ps::out) is semidet.
% Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-][0-9]+)?
+ % followed by any whitespace.
%
-:- pred float_literal(src::in)
- : parser(float) `with_inst` parser.
+:- pred float_literal(src::in, float::out,
+ ps::in, ps::out) is semidet.
- % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+.
+ % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+.
+ % Any following whitespace is also consumed.
%
-:- pred int_literal(src::in)
- : parser(int) `with_inst` parser.
+:- pred int_literal(src::in, int::out,
+ ps::in, ps::out) is semidet.
- % Parse an string literal. The string argument is the quote character.
- % A backslash (\) character in the string makes the next character
- % literal (e.g., for embedding quotes). These 'escaped' characters
- % are included as-is in the result, along with the preceding backslash.
- %
-:- pred string_literal(src::in, char::in)
- : parser(string) `with_inst` parser.
-
- % optional(Src, P) returns yes(X), if P succeeds returning X, or no
- % if P does not succeed.
- %
-:- pred optional(src::in, parser(T)::in(parser))
- : parser(maybe(T)) `with_inst` parser.
-
- % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
- % by repeatedly applying P until P fails. The nth item in Xs is
- % the result from the nth application of P.
- %
-:- pred zero_or_more(src::in, parser(T)::in(parser))
- : parser(list(T)) `with_inst` parser.
-
- % one_or_more(Src, P, Xs) returns the list of results Xs obtained
- % by repeatedly applying P until P fails. The nth item in Xs is
- % the result from the nth application of P. P must succeed at
- % least once.
- %
-:- pred one_or_more(src::in, parser(T)::in(parser))
- : parser(list(T)) `with_inst` parser.
-
- % brackets(Src, L, R, P, X) is equivalent to
- % punct(Src, L, _), P(Src, X), punct(Src, R, _).
- %
-:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser))
- : parser(T) `with_inst` parser.
-
- % separated_list(Src, Separator, P, Xs) is like
- % zero_or_more(Src, P, Xs) except that successive applications of
- % P must be separated by punct(Src, Separator, _).
- %
-:- pred separated_list(src::in, string::in, parser(T)::in(parser))
- : parser(list(T)) `with_inst` parser.
-
- % comma_separated_list(Src, P, Xs) is the same as
- % separated_list(Src, ",", P, Xs).
- %
-:- pred comma_separated_list(src::in, parser(T)::in(parser))
- : parser(list(T)) `with_inst` parser.
+ % Parse a string literal. The string argument is the quote character.
+ % A backslash (\) character in the string makes the next character
+ % literal (e.g., for embedding quotes). These 'escaped' characters
+ % are included as-is in the result, along with the preceding backslash.
+ % Any following whitespace is also consumed.
+ %
+:- pred string_literal(src::in, char::in, string::out,
+ ps::in, ps::out) is semidet.
+
+ % optional(Src, P) returns yes(X), if P succeeds returning X, or no
+ % if P does not succeed.
+ %
+:- pred optional(src::in, parser(T)::in(parser), maybe(T)::out,
+ ps::in, ps::out) is semidet.
+
+ % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P.
+ %
+:- pred zero_or_more(src::in, parser(T)::in(parser), list(T)::out,
+ ps::in, ps::out) is semidet.
+
+ % one_or_more(Src, P, Xs) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P. P must succeed at
+ % least once.
+ %
+:- pred one_or_more(src::in, parser(T)::in(parser), list(T)::out,
+ ps::in, ps::out) is semidet.
+
+ % brackets(Src, L, R, P, X) is equivalent to
+ % punct(Src, L, _), P(Src, X), punct(Src, R, _).
+ %
+:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser), T::out,
+ ps::in, ps::out) is semidet.
+
+ % separated_list(Src, Separator, P, Xs) is like
+ % zero_or_more(Src, P, Xs) except that successive applications of
+ % P must be separated by punct(Src, Separator, _).
+ %
+:- pred separated_list(src::in, string::in, parser(T)::in(parser),
+ list(T)::out,
+ ps::in, ps::out) is semidet.
+
+ % comma_separated_list(Src, P, Xs) is the same as
+ % separated_list(Src, ",", P, Xs).
+ %
+:- pred comma_separated_list(src::in, parser(T)::in(parser), list(T)::out,
+ ps::in, ps::out) is semidet.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
@@ -198,8 +235,6 @@
%-----------------------------------------------------------------------------%
-:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
-
input_substring(Src, Start, EndPlusOne, Substring) :-
EndPlusOne =< Src ^ input_length,
Substring =
@@ -207,7 +242,8 @@
%-----------------------------------------------------------------------------%
-:- pred match_string(src::in, string::in, ps::in, ps::out) is semidet.
+:- pred match_string(src::in, string::in,
+ ps::in, ps::out) is semidet.
match_string(Src, MatchStr, PS, PS + N) :-
N = string.length(MatchStr),
@@ -347,8 +383,8 @@
%-----------------------------------------------------------------------------%
-:- pred digits(src::in, int::in)
- : parser(unit) `with_inst` parser.
+:- pred digits(src::in, int::in, unit::out,
+ ps::in, ps::out) is semidet.
digits(Src, Base, unit, !PS) :-
next_char(Src, C, !PS),
@@ -357,8 +393,8 @@
digits_2(Src, Base, _, !PS).
-:- pred digits_2(src::in, int::in)
- : parser(unit) `with_inst` parser.
+:- pred digits_2(src::in, int::in, unit::out,
+ ps::in, ps::out) is semidet.
digits_2(Src, Base, unit, !PS) :-
( if
@@ -383,8 +419,8 @@
%-----------------------------------------------------------------------------%
-:- pred string_literal_2(src::in, char::in)
- : parser(unit) `with_inst` parser.
+:- pred string_literal_2(src::in, char::in, unit::out,
+ ps::in, ps::out) is semidet.
string_literal_2(Src, QuoteChar, unit, !PS) :-
next_char(Src, C, !PS),
@@ -408,8 +444,8 @@
%-----------------------------------------------------------------------------%
-:- pred identifier_2(src::in, string::in)
- : parser(unit) `with_inst` parser.
+:- pred identifier_2(src::in, string::in, unit::out,
+ ps::in, ps::out) is semidet.
identifier_2(Src, IdChars, unit, !PS) :-
( if char_in_class(Src, IdChars, _, !PS) then
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to: mercury-developers at csse.unimelb.edu.au
Administrative Queries: owner-mercury-developers at csse.unimelb.edu.au
Subscriptions: mercury-developers-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the developers
mailing list