[m-dev.] Proposal: parsing module for the library.
Ralph Becket
rafe at csse.unimelb.edu.au
Tue Jan 13 14:25:28 AEDT 2009
Below is a recursive descent parsing module I'd like to add to the
library (I've used pretty much identical code half a dozen times over
the last two or three years, so it's useful stuff).
-- Ralph
%-----------------------------------------------------------------------------%
% parsing.m
% Ralph Becket <rafe at csse.unimelb.edu.au>
% Tue Jan 13 11:32:49 EST 2009
% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
%
% Support for DCG style parsers.
%
%-----------------------------------------------------------------------------%
:- module parsing.
:- interface.
:- import_module char.
:- import_module float.
:- import_module int.
:- import_module list.
:- import_module maybe.
:- import_module string.
:- import_module unit.
% The parser "state", passed around in DCG arguments.
%
:- type ps.
% The parser source (input string).
%
:- type src.
% Construct a new parser source and state from a string.
%
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
:- type parser(T) == pred(T, ps, ps).
:- inst parser == ( pred(out, in, out) is semidet ).
% Read the next char.
%
:- pred char(src::in)
: parser(char) `with_inst` parser.
% Match a char from the given string.
%
:- pred char_in_class(src::in, string::in)
: parser(char) `with_inst` parser.
% Match a string exactly and any subsequent whitespace.
%
:- pred punct(src::in, string::in)
: parser(unit) `with_inst` parser.
% keyword(Src, IdChars, Keyword, _) matches Keyword exactly (i.e., it must
% not be followed by any character in IdChars) and any subsequent
% whitespace.
%
:- pred keyword(src::in, string::in, string::in)
: parser(unit) `with_inst` parser.
% identifier(Src, InitIdChars, IdChars, Identifier) matches the next
% identifer (result in Identifier) comprising a char from InitIdChars
% followed by zero or more chars from IdChars.
%
:- pred identifier(src::in, string::in, string::in)
: parser(string) `with_inst` parser.
% Consume any whitespace.
%
:- pred whitespace(src::in)
: parser(unit) `with_inst` parser.
% Consume any input up to, and including, the next newline character
% marking the end of the current line.
%
:- pred skip_to_eol(src::in)
: parser(unit) `with_inst` parser.
% Succeed if we have reached the end of the input.
%
:- pred eof(src::in)
: parser(unit) `with_inst` parser.
% Parse a float literal
%
:- pred float(src::in)
: parser(float) `with_inst` parser.
% Parse an int literal.
%
:- pred int(src::in)
: parser(int) `with_inst` parser.
% Parse an string literal. The string argument is the quote character.
%
:- pred string(src::in, char::in)
: parser(string) `with_inst` parser.
:- pred optional(src::in, parser(T)::in(parser))
: parser(maybe(T)) `with_inst` parser.
:- pred zero_or_more(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
:- pred one_or_more(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
:- pred brackets(src::in, string::in, string::in, parser(T)::in(parser))
: parser(T) `with_inst` parser.
:- pred separated_list(src::in, string::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
:- pred comma_separated_list(src::in, parser(T)::in(parser))
: parser(list(T)) `with_inst` parser.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
% The parser "state" is just the offset into the input string.
%
:- type ps == int.
:- type src
---> src(
input_length :: int,
input_string :: string
).
%-----------------------------------------------------------------------------%
new_src_and_ps(InputString, Src, PS) :-
Src = src(string.length(InputString), InputString),
PS = 0.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% Low-level predicates.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
current_offset(_Src, PS, PS, PS).
%-----------------------------------------------------------------------------%
eof(Src, unit, !PS) :-
!.PS = Src ^ input_length.
%-----------------------------------------------------------------------------%
char(Src, Char, PS, PS + 1) :-
PS < Src ^ input_length,
Char = Src ^ input_string ^ unsafe_elem(PS).
%-----------------------------------------------------------------------------%
char_in_class(Src, CharClass, Char) -->
char(Src, Char),
{ string.contains_char(CharClass, Char) }.
%-----------------------------------------------------------------------------%
:- pred input_substring(src::in, int::in, int::in)
: parser(string) `with_inst` parser.
input_substring(Src, Start, EndPlusOne, Substring, !PS) :-
EndPlusOne =< Src ^ input_length,
Substring =
unsafe_substring(Src ^ input_string, Start, EndPlusOne - Start).
%-----------------------------------------------------------------------------%
:- pred match_string(src::in, string::in, ps::in, ps::out) is semidet.
match_string(Src, MatchStr, PS, PS + N) :-
N = string.length(MatchStr),
PS + N =< Src ^ input_length,
match_string_2(N, 0, MatchStr, PS, Src ^ input_string).
:- pred match_string_2(int::in, int::in, string::in, int::in, string::in)
is semidet.
match_string_2(N, I, MatchStr, Offset, Str) :-
( if I < N then
MatchStr ^ unsafe_elem(I) = Str ^ unsafe_elem(Offset + I),
match_string_2(N, I + 1, MatchStr, Offset, Str)
else
true
).
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% Utility predicates.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
optional(_Src, P, Result) -->
( if P(X) then
{ Result = yes(X) }
else
{ Result = no },
{ semidet_true }
).
%-----------------------------------------------------------------------------%
zero_or_more(Src, P, Result) -->
( if P(X), zero_or_more(Src, P, Xs) then
{ Result = [X | Xs] }
else
{ Result = [] },
{ semidet_true }
).
%-----------------------------------------------------------------------------%
one_or_more(Src, P, Result) -->
P(X),
zero_or_more(Src, P, Xs),
{ Result = [X | Xs] }.
%-----------------------------------------------------------------------------%
brackets(Src, L, R, P, Result) -->
punct(Src, L, _),
P(Result),
punct(Src, R, _).
%-----------------------------------------------------------------------------%
separated_list(Src, Separator, P, Result) -->
{ CommaP = ( pred(CommaPX::out, !.PS::in, !:PS::out) is semidet :-
punct(Src, Separator, _, !PS),
P(CommaPX, !PS)
) },
P(X),
zero_or_more(Src, CommaP, Xs),
{ Result = [X | Xs] }.
%-----------------------------------------------------------------------------%
comma_separated_list(Src, P, Result) -->
separated_list(Src, ",", P, Result).
%-----------------------------------------------------------------------------%
whitespace(Src, unit) -->
( if char(Src, C), { char.is_whitespace(C) } then
whitespace(Src, _)
else
[]
).
%-----------------------------------------------------------------------------%
skip_to_eol(Src, unit) -->
char(Src, C),
( if { C = ('\n') } then [] else skip_to_eol(Src, _) ).
%-----------------------------------------------------------------------------%
punct(Src, Punct, unit) -->
match_string(Src, Punct),
whitespace(Src, _).
%-----------------------------------------------------------------------------%
keyword(Src, IdChars, Keyword, unit) -->
match_string(Src, Keyword),
not char_in_class(Src, IdChars, _),
whitespace(Src, _).
%-----------------------------------------------------------------------------%
float(Src, Float) -->
current_offset(Src, Start),
( if char(Src, ('-')) then [] else [] ),
digits(Src, 10, _),
char(Src, ('.')),
digits(Src, 10, _),
( if char_in_class(Src, "eE", _) then
( if char(Src, ('-')) then [] else [] ),
digits(Src, 10, _)
else
[]
),
current_offset(Src, EndPlusOne),
whitespace(Src, _),
input_substring(Src, Start, EndPlusOne, SubString),
{ Float = string.det_to_float(SubString) }.
%-----------------------------------------------------------------------------%
int(Src, Int) -->
current_offset(Src, Start),
optional(Src, char_in_class(Src, "-"), _),
digits(Src, 10, _),
not (
char(Src, ('.')),
digits(Src, 10, _)
),
current_offset(Src, EndPlusOne),
whitespace(Src, _),
input_substring(Src, Start, EndPlusOne, SubString),
{ Int = string.det_to_int(SubString) }.
%-----------------------------------------------------------------------------%
:- pred digits(src::in, int::in)
: parser(unit) `with_inst` parser.
digits(Src, Base, unit) -->
char(Src, C),
{ char.digit_to_int(C, D), D < Base },
digits_2(Src, Base, _).
:- pred digits_2(src::in, int::in)
: parser(unit) `with_inst` parser.
digits_2(Src, Base, unit) -->
( if
char(Src, C),
{ char.digit_to_int(C, D), D < Base }
then
digits_2(Src, Base, _)
else
[]
).
%-----------------------------------------------------------------------------%
string(Src, QuoteChar, String) -->
current_offset(Src, Start),
char(Src, QuoteChar),
string_2(Src, QuoteChar, _),
current_offset(Src, EndPlusOne),
whitespace(Src, _),
input_substring(Src, Start + 1, EndPlusOne - 1, String).
%-----------------------------------------------------------------------------%
:- pred string_2(src::in, char::in)
: parser(unit) `with_inst` parser.
string_2(Src, QuoteChar, unit) -->
char(Src, C),
( if { C = QuoteChar } then
[]
else if { C = ('\\') } then
char(Src, _),
string_2(Src, QuoteChar, _)
else
string_2(Src, QuoteChar, _)
).
%-----------------------------------------------------------------------------%
identifier(Src, InitIdChars, IdChars, Identifier) -->
current_offset(Src, Start),
char_in_class(Src, InitIdChars, _),
identifier_2(Src, IdChars, _),
current_offset(Src, EndPlusOne),
input_substring(Src, Start, EndPlusOne, Identifier).
%-----------------------------------------------------------------------------%
:- pred identifier_2(src::in, string::in)
: parser(unit) `with_inst` parser.
identifier_2(Src, IdChars, unit) -->
( if char_in_class(Src, IdChars, _) then
identifier_2(Src, IdChars, _)
else
[]
).
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to: mercury-developers at csse.unimelb.edu.au
Administrative Queries: owner-mercury-developers at csse.unimelb.edu.au
Subscriptions: mercury-developers-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the developers
mailing list