[m-dev.] Proposal: parsing module for the library.
Ralph Becket
rafe at csse.unimelb.edu.au
Tue Jan 13 17:00:21 AEDT 2009
Here's a diff addressing most of the review comments:
--- parsing.m 2009-01-13 16:52:32.206161260 +1100
+++ parsing_utils.m 2009-01-13 16:59:03.720192432 +1100
@@ -1,14 +1,14 @@
%-----------------------------------------------------------------------------%
-% parsing.m
+% parsing_utils.m
% Ralph Becket <rafe at csse.unimelb.edu.au>
% Tue Jan 13 11:32:49 EST 2009
% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
%
-% Support for DCG style parsers.
+% Utilities for recursive descent parsers.
%
%-----------------------------------------------------------------------------%
-:- module parsing.
+:- module parsing_utils.
:- interface.
@@ -30,16 +30,20 @@
%
:- type src.
+:- type parser(T) == pred(T, ps, ps).
+:- inst parser == ( pred(out, in, out) is semidet ).
+
% Construct a new parser source and state from a string.
%
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
-:- type parser(T) == pred(T, ps, ps).
-:- inst parser == ( pred(out, in, out) is semidet ).
+ % Obtain the current offset from the start of the input string.
+ %
+:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
% Read the next char.
%
-:- pred char(src::in)
+:- pred next_char(src::in)
: parser(char) `with_inst` parser.
% Match a char from the given string.
@@ -82,36 +86,61 @@
:- pred eof(src::in)
: parser(unit) `with_inst` parser.
- % Parse a float literal
+ % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-][0-9]+)?
%
-:- pred float(src::in)
+:- pred float_literal(src::in)
: parser(float) `with_inst` parser.
- % Parse an int literal.
+ % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+.
%
-:- pred int(src::in)
+:- pred int_literal(src::in)
: parser(int) `with_inst` parser.
% Parse an string literal. The string argument is the quote character.
+ % A backslash (\) character in the string makes the next character
+ % literal (e.g., for embedding quotes). These 'escaped' characters
+ % are included as-is in the result, along with the preceding backslash.
%
-:- pred string(src::in, char::in)
+:- pred string_literal(src::in, char::in)
: parser(string) `with_inst` parser.
+ % optional(Src, P) returns yes(X), if P succeeds returning X, or no
+ % if P does not succeed.
+ %
:- pred optional(src::in, parser(T)::in(parser))
: parser(maybe(T)) `with_inst` parser.
+ % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P.
+ %
:- pred zero_or_more(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
+ % one_or_more(Src, P, Xs) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P. P must succeed at
+ % least once.
+ %
:- pred one_or_more(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
+ % brackets(Src, L, R, P, X) is equivalent to
+ % punct(Src, L, _), P(Src, X), punct(Src, R, _).
+ %
:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser))
: parser(T) `with_inst` parser.
+ % separated_list(Src, Separator, P, Xs) is like
+ % zero_or_more(Src, P, Xs) except that successive applications of
+ % P must be separated by punct(Src, Separator, _).
+ %
:- pred separated_list(src::in, string::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
+ % comma_separated_list(Src, P, Xs) is the same as
+ % separated_list(Src, ",", P, Xs).
+ %
:- pred comma_separated_list(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
@@ -144,33 +173,34 @@
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
-:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
-
-current_offset(_Src, PS, PS, PS).
+current_offset(_Src, Offset, !PS) :-
+ Offset = !.PS.
%-----------------------------------------------------------------------------%
eof(Src, unit, !PS) :-
- !.PS = Src ^ input_length.
+ current_offset(Src, Offset, !PS),
+ Offset = Src ^ input_length.
%-----------------------------------------------------------------------------%
-char(Src, Char, PS, PS + 1) :-
- PS < Src ^ input_length,
- Char = Src ^ input_string ^ unsafe_elem(PS).
+next_char(Src, Char, !PS) :-
+ current_offset(Src, Offset, !PS),
+ Offset < Src ^ input_length,
+ Char = Src ^ input_string ^ unsafe_elem(Offset),
+ !:PS = !.PS + 1.
%-----------------------------------------------------------------------------%
-char_in_class(Src, CharClass, Char) -->
- char(Src, Char),
- { string.contains_char(CharClass, Char) }.
+char_in_class(Src, CharClass, Char, !PS) :-
+ next_char(Src, Char, !PS),
+ string.contains_char(CharClass, Char).
%-----------------------------------------------------------------------------%
-:- pred input_substring(src::in, int::in, int::in)
- : parser(string) `with_inst` parser.
+:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
-input_substring(Src, Start, EndPlusOne, Substring, !PS) :-
+input_substring(Src, Start, EndPlusOne, Substring) :-
EndPlusOne =< Src ^ input_length,
Substring =
unsafe_substring(Src ^ input_string, Start, EndPlusOne - Start).
@@ -202,173 +232,178 @@
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
-optional(_Src, P, Result) -->
- ( if P(X) then
- { Result = yes(X) }
+optional(_Src, P, Result, !PS) :-
+ ( if P(X, !PS) then
+ Result = yes(X)
else
- { Result = no },
- { semidet_true }
+ Result = no,
+ semidet_true
).
%-----------------------------------------------------------------------------%
-zero_or_more(Src, P, Result) -->
- ( if P(X), zero_or_more(Src, P, Xs) then
- { Result = [X | Xs] }
+zero_or_more(Src, P, Result, !PS) :-
+ ( if P(X, !PS), zero_or_more(Src, P, Xs, !PS) then
+ Result = [X | Xs]
else
- { Result = [] },
- { semidet_true }
+ Result = [],
+ semidet_true
).
%-----------------------------------------------------------------------------%
-one_or_more(Src, P, Result) -->
- P(X),
- zero_or_more(Src, P, Xs),
- { Result = [X | Xs] }.
+one_or_more(Src, P, Result, !PS) :-
+ P(X, !PS),
+ zero_or_more(Src, P, Xs, !PS),
+ Result = [X | Xs].
%-----------------------------------------------------------------------------%
-brackets(Src, L, R, P, Result) -->
- punct(Src, L, _),
- P(Result),
- punct(Src, R, _).
+brackets(Src, L, R, P, Result, !PS) :-
+ punct(Src, L, _, !PS),
+ P(Result, !PS),
+ punct(Src, R, _, !PS).
%-----------------------------------------------------------------------------%
-separated_list(Src, Separator, P, Result) -->
- { CommaP = ( pred(CommaPX::out, !.PS::in, !:PS::out) is semidet :-
+separated_list(Src, Separator, P, Result, !PS) :-
+ CommaP = ( pred(CommaPX::out, !.PS::in, !:PS::out) is semidet :-
punct(Src, Separator, _, !PS),
P(CommaPX, !PS)
- ) },
- P(X),
- zero_or_more(Src, CommaP, Xs),
- { Result = [X | Xs] }.
+ ),
+ P(X, !PS),
+ zero_or_more(Src, CommaP, Xs, !PS),
+ Result = [X | Xs].
%-----------------------------------------------------------------------------%
-comma_separated_list(Src, P, Result) -->
- separated_list(Src, ",", P, Result).
+comma_separated_list(Src, P, Result, !PS) :-
+ separated_list(Src, ",", P, Result, !PS).
%-----------------------------------------------------------------------------%
-whitespace(Src, unit) -->
- ( if char(Src, C), { char.is_whitespace(C) } then
- whitespace(Src, _)
+whitespace(Src, unit, !PS) :-
+ ( if
+ next_char(Src, C, !PS),
+ char.is_whitespace(C)
+ then
+ whitespace(Src, _, !PS)
else
- []
+ true
).
%-----------------------------------------------------------------------------%
-skip_to_eol(Src, unit) -->
- char(Src, C),
- ( if { C = ('\n') } then [] else skip_to_eol(Src, _) ).
+skip_to_eol(Src, unit, !PS) :-
+ next_char(Src, C, !PS),
+ ( if C = ('\n') then true else skip_to_eol(Src, _, !PS) ).
%-----------------------------------------------------------------------------%
-punct(Src, Punct, unit) -->
- match_string(Src, Punct),
- whitespace(Src, _).
+punct(Src, Punct, unit, !PS) :-
+ match_string(Src, Punct, !PS),
+ whitespace(Src, _, !PS).
%-----------------------------------------------------------------------------%
-keyword(Src, IdChars, Keyword, unit) -->
- match_string(Src, Keyword),
- not char_in_class(Src, IdChars, _),
- whitespace(Src, _).
+keyword(Src, IdChars, Keyword, unit, !PS) :-
+ match_string(Src, Keyword, !PS),
+ not char_in_class(Src, IdChars, _, !.PS, _),
+ whitespace(Src, _, !PS).
%-----------------------------------------------------------------------------%
-float(Src, Float) -->
- current_offset(Src, Start),
- ( if char(Src, ('-')) then [] else [] ),
- digits(Src, 10, _),
- char(Src, ('.')),
- digits(Src, 10, _),
- ( if char_in_class(Src, "eE", _) then
- ( if char(Src, ('-')) then [] else [] ),
- digits(Src, 10, _)
+float_literal(Src, Float, !PS) :-
+ current_offset(Src, Start, !PS),
+ ( if next_char(Src, ('-'), !PS) then true else true ),
+ digits(Src, 10, _, !PS),
+ next_char(Src, ('.'), !PS),
+ digits(Src, 10, _, !PS),
+ ( if char_in_class(Src, "eE", _, !PS) then
+ ( if next_char(Src, ('-'), !PS) then true else true ),
+ digits(Src, 10, _, !PS)
else
- []
+ true
),
- current_offset(Src, EndPlusOne),
- whitespace(Src, _),
+ current_offset(Src, EndPlusOne, !PS),
+ whitespace(Src, _, !PS),
input_substring(Src, Start, EndPlusOne, SubString),
- { Float = string.det_to_float(SubString) }.
+ Float = string.det_to_float(SubString).
%-----------------------------------------------------------------------------%
-int(Src, Int) -->
- current_offset(Src, Start),
- optional(Src, char_in_class(Src, "-"), _),
- digits(Src, 10, _),
+int_literal(Src, Int, !PS) :-
+ current_offset(Src, Start, !PS),
+ optional(Src, char_in_class(Src, "-"), _, !PS),
+ digits(Src, 10, _, !PS),
not (
- char(Src, ('.')),
- digits(Src, 10, _)
+ next_char(Src, ('.'), !PS),
+ digits(Src, 10, _, !.PS, _)
),
- current_offset(Src, EndPlusOne),
- whitespace(Src, _),
+ current_offset(Src, EndPlusOne, !PS),
+ whitespace(Src, _, !PS),
input_substring(Src, Start, EndPlusOne, SubString),
- { Int = string.det_to_int(SubString) }.
+ Int = string.det_to_int(SubString).
%-----------------------------------------------------------------------------%
:- pred digits(src::in, int::in)
: parser(unit) `with_inst` parser.
-digits(Src, Base, unit) -->
- char(Src, C),
- { char.digit_to_int(C, D), D < Base },
- digits_2(Src, Base, _).
+digits(Src, Base, unit, !PS) :-
+ next_char(Src, C, !PS),
+ char.digit_to_int(C, D),
+ D < Base,
+ digits_2(Src, Base, _, !PS).
:- pred digits_2(src::in, int::in)
: parser(unit) `with_inst` parser.
-digits_2(Src, Base, unit) -->
+digits_2(Src, Base, unit, !PS) :-
( if
- char(Src, C),
- { char.digit_to_int(C, D), D < Base }
+ next_char(Src, C, !PS),
+ char.digit_to_int(C, D),
+ D < Base
then
- digits_2(Src, Base, _)
+ digits_2(Src, Base, _, !PS)
else
- []
+ true
).
%-----------------------------------------------------------------------------%
-string(Src, QuoteChar, String) -->
- current_offset(Src, Start),
- char(Src, QuoteChar),
- string_2(Src, QuoteChar, _),
- current_offset(Src, EndPlusOne),
- whitespace(Src, _),
+string_literal(Src, QuoteChar, String, !PS) :-
+ current_offset(Src, Start, !PS),
+ next_char(Src, QuoteChar, !PS),
+ string_literal_2(Src, QuoteChar, _, !PS),
+ current_offset(Src, EndPlusOne, !PS),
+ whitespace(Src, _, !PS),
input_substring(Src, Start + 1, EndPlusOne - 1, String).
%-----------------------------------------------------------------------------%
-:- pred string_2(src::in, char::in)
+:- pred string_literal_2(src::in, char::in)
: parser(unit) `with_inst` parser.
-string_2(Src, QuoteChar, unit) -->
- char(Src, C),
- ( if { C = QuoteChar } then
- []
- else if { C = ('\\') } then
- char(Src, _),
- string_2(Src, QuoteChar, _)
+string_literal_2(Src, QuoteChar, unit, !PS) :-
+ next_char(Src, C, !PS),
+ ( if C = QuoteChar then
+ true
+ else if C = ('\\') then
+ next_char(Src, _, !PS),
+ string_literal_2(Src, QuoteChar, _, !PS)
else
- string_2(Src, QuoteChar, _)
+ string_literal_2(Src, QuoteChar, _, !PS)
).
%-----------------------------------------------------------------------------%
-identifier(Src, InitIdChars, IdChars, Identifier) -->
- current_offset(Src, Start),
- char_in_class(Src, InitIdChars, _),
- identifier_2(Src, IdChars, _),
- current_offset(Src, EndPlusOne),
+identifier(Src, InitIdChars, IdChars, Identifier, !PS) :-
+ current_offset(Src, Start, !PS),
+ char_in_class(Src, InitIdChars, _, !PS),
+ identifier_2(Src, IdChars, _, !PS),
+ current_offset(Src, EndPlusOne, !PS),
input_substring(Src, Start, EndPlusOne, Identifier).
%-----------------------------------------------------------------------------%
@@ -376,11 +411,11 @@
:- pred identifier_2(src::in, string::in)
: parser(unit) `with_inst` parser.
-identifier_2(Src, IdChars, unit) -->
- ( if char_in_class(Src, IdChars, _) then
- identifier_2(Src, IdChars, _)
+identifier_2(Src, IdChars, unit, !PS) :-
+ ( if char_in_class(Src, IdChars, _, !PS) then
+ identifier_2(Src, IdChars, _, !PS)
else
- []
+ true
).
%-----------------------------------------------------------------------------%
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to: mercury-developers at csse.unimelb.edu.au
Administrative Queries: owner-mercury-developers at csse.unimelb.edu.au
Subscriptions: mercury-developers-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the developers
mailing list