[m-rev.] For review: updates to parsing_utils.m
Ralph Becket
rafe at csse.unimelb.edu.au
Tue Jun 16 17:15:59 AEST 2009
Estimated hours taken: 2
Branches: main
library/parsing_utils.m:
Add support for parsers that update their own state (e.g., a symbol
table). Add support for user-defined whitespace specification (e.g.,
to include comments). Add support for converting offsets into line
numbers (e.g., for error reporting).
tests/general/test_parsing_utils.m:
tests/general/test_parsing_utils.exp:
Update the test case.
Index: library/parsing_utils.m
===================================================================
RCS file: /home/mercury1/repository/mercury/library/parsing_utils.m,v
retrieving revision 1.1
diff -u -r1.1 parsing_utils.m
--- library/parsing_utils.m 28 Jan 2009 07:19:41 -0000 1.1
+++ library/parsing_utils.m 16 Jun 2009 04:30:17 -0000
@@ -15,10 +15,13 @@
% state (ps) input/output pair tracking the current offset into the input.
%
% A new src and ps can be constructed by calling
-% new_src_and_ps(InputString, Src, !:PS). Parsing predicates are semidet
-% and typically take the form p(...input arguments..., Src, Result, !PS).
-% A parser matching variable assignments of the form `x = 42' might be
-% defined like this:
+% new_src_and_ps(InputString, SkipWS, Src, !:PS) where the SkipWS function
+% is used by the primitive parsers to skip over any following whitespace
+% (providing a skipping function allows users to define comments as
+% whitespace).
+% Parsing predicates are semidet and typically take the form
+% p(...parameters..., Src, Result, !PS). A parser matching variable
+% assignments of the form `x = 42' might be defined like this:
%
% var_assignment(Src, {Var, Value}, !PS) :-
% var(Src, Var, !PS),
@@ -54,13 +57,29 @@
%
:- type ps.
- % This type and inst are useful for specifying "standard" parser
+ % These types and insts are useful for specifying "standard" parser
% signatures.
%
:- type parser(T) == pred(src, T, ps, ps).
:- inst parser == ( pred(in, out, in, out) is semidet ).
- % Construct a new parser source and state from a string.
+ % The following are for parsers that also transform a separate state value.
+ %
+:- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
+:- inst parser_with_state == ( pred(in, out, in, out, in, out) is semidet ).
+
+ % Construct a new parser source and state from a string, also specifying
+ % a function for skipping over whitespace (several primitive parsers
+ % use this function to consume whitespace after a token; this argument
+ % allows the user to specify a function for, say, skipping over comments
+ % as well).
+ %
+:- pred new_src_and_ps(string::in,
+ (func(src, ps) = ps)::in(func(in, in) = out is det),
+ src::out, ps::out) is det.
+
+ % Construct a new parser source and state from a string (the default
+ % whitespace parser is used).
%
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
@@ -70,6 +89,21 @@
:- pred current_offset(src::in, int::out,
ps::in, ps::out) is det.
+ % Compute a structure from the parser source which can be used to
+ % convert offsets into line numbers and positions in the file (this
+ % is useful for error reporting).
+ %
+:- type line_numbers.
+
+:- func src_to_line_numbers(src) = line_numbers.
+
+ % Convert an offset into a line number and position within the line
+ % (the first line is number 1; the first character in a line is
+ % position 1).
+ %
+:- pred offset_to_line_number_and_position(line_numbers::in, int::in,
+ int::out, int::out) is det.
+
% input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
% Copy the substring from the input occupying the offsets
% [StartOffset, EndOffsetPlusOne).
@@ -91,7 +125,7 @@
:- pred punct(string::in, src::in, unit::out,
ps::in, ps::out) is semidet.
- % keyword(Src, IdChars, Keyword, _, !PS) matches Keyword exactly (i.e., it
+ % keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
% must not be followed by any character in IdChars) and any subsequent
% whitespace.
%
@@ -194,11 +228,59 @@
:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
ps::in, ps::out) is semidet.
+% The following parser combinators are equivalent to the above, except that
+% a separate state argument is threaded through the computation (e.g., for
+% parsers that incrementally construct a symbol table).
+
+ % optional(P, Src, Result, !S, !PS) returns Result = yes(X) if
+ % P(Src, X, !S, !PS) succeeds, or Result = no (leaving !S and !PS
+ % unchanged) if P does not succeed.
+ %
+:- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
+ maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+ % zero_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P.
+ %
+:- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
+ list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+ % one_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
+ % by repeatedly applying P until P fails. The nth item in Xs is
+ % the result from the nth application of P. P must succeed at
+ % least once.
+ %
+:- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
+ list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+ % brackets(L, R, P, Src, X, !S, !PS) is equivalent to
+ % punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
+ %
+:- pred brackets(string::in, string::in,
+ parser_with_state(T, S)::in(parser_with_state), src::in,
+ T::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+ % separated_list(Separator, P, Src, Xs, !S, !PS) is like
+ % zero_or_more(P, Src, Xs, !S, !PS) except that successive applications of
+ % P must be separated by punct(Separator, Src, _, !PS).
+ % Note that, as implemented, P must succeed at least once.
+ %
+:- pred separated_list(string::in,
+ parser_with_state(T, S)::in(parser_with_state),
+ src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+ % comma_separated_list(P, Src, Xs, !S, !PS) is the same as
+ % separated_list(",", P, Src, Xs, !S, !PS).
+ %
+:- pred comma_separated_list(parser_with_state(T, S)::in(parser_with_state),
+ src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
+:- import_module array.
+
% The parser "state" is just the offset into the input string.
@@ -208,21 +290,90 @@
:- type src
---> src(
input_length :: int,
- input_string :: string
+ input_string :: string,
+ skip_ws_func :: func(src, ps) = ps
).
%-----------------------------------------------------------------------------%
new_src_and_ps(InputString, Src, PS) :-
- Src = src(string.length(InputString), InputString),
+ new_src_and_ps(InputString, skip_whitespace, Src, PS).
+
+%-----------------------------------------------------------------------------%
+
+new_src_and_ps(InputString, SkipWS, Src, PS) :-
+ Src = src(string.length(InputString), InputString, SkipWS),
PS = 0.
%-----------------------------------------------------------------------------%
+
+    % The whitespace-skipping function used when the caller does not supply
+    % one: return the parser state reached by whitespace/4 starting at PS0,
+    % or PS0 itself if whitespace/4 does not succeed.
+    %
+:- func skip_whitespace(src, ps) = ps.
+
+skip_whitespace(Src, PS0) = PS :-
+    ( if whitespace(Src, _Unit, PS0, PS1) then
+        PS = PS1
+    else
+        PS = PS0
+    ).
+
+%-----------------------------------------------------------------------------%
+
+    % Advance the parser state by applying the skipping function stored in
+    % the skip_ws_func field of Src.
+    %
+:- pred skip_whitespace(src::in, ps::in, ps::out) is det.
+
+skip_whitespace(Src, !PS) :-
+    SkipFn = Src ^ skip_ws_func,
+    !:PS = SkipFn(Src, !.PS).
+
+%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% Low-level predicates.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
+:- type line_numbers == array(int).
+
+%-----------------------------------------------------------------------------%
+
+    % Build the line_numbers structure for Src: an array holding the offset
+    % of every newline character in the input string.
+    %
+src_to_line_numbers(Src) = LineNos :-
+    Input = Src ^ input_string,
+    FirstOffset = 0,
+    LastOffset = Src ^ input_length - 1,
+    CollectNewline =
+        ( func(Offset, Offsets0) = Offsets :-
+            ( if string.unsafe_index(Input, Offset) = ('\n') then
+                Offsets = [Offset | Offsets0]
+            else
+                Offsets = Offsets0
+            )
+        ),
+    % Folding from the last offset down to the first while consing means
+    % the resulting list is in ascending offset order.
+    NewlineOffsets =
+        int.fold_down(CollectNewline, FirstOffset, LastOffset, []),
+    LineNos = array(NewlineOffsets).
+
+%-----------------------------------------------------------------------------%
+
+offset_to_line_number_and_position(LineNos, Offset, LineNo, Pos) :-
+    % LineNos holds the offsets of the newline characters in the input, in
+    % ascending order.  We search the half-open index range [Lo, Hi) so
+    % that an input with no newlines (an empty array) and offsets lying
+    % after the final newline are both handled without indexing out of
+    % bounds.
+    Lo = 0,
+    Hi = array.size(LineNos),
+    offset_to_line_number_and_position_2(LineNos, Lo, Hi, Offset, LineNo, Pos).
+
+%-----------------------------------------------------------------------------%
+
+:- pred offset_to_line_number_and_position_2(line_numbers::in, int::in,
+    int::in, int::in, int::out, int::out) is det.
+
+    % Perform a binary search for the number of newline characters that
+    % occur strictly before Offset.
+    %
+    % Invariant: every entry with index < Lo is less than Offset, and
+    % every entry with index >= Hi is greater than or equal to Offset.
+    % When the range [Lo, Hi) is empty, Lo is therefore the number of
+    % newlines preceding Offset.
+    %
+offset_to_line_number_and_position_2(LineNos, Lo, Hi, Offset, LineNo, Pos) :-
+    ( if Lo < Hi then
+        Mid = (Lo + Hi) / 2,
+        MidOffset = LineNos ^ elem(Mid),
+        ( if MidOffset < Offset then
+            offset_to_line_number_and_position_2(LineNos, Mid + 1, Hi, Offset,
+                LineNo, Pos)
+        else
+            offset_to_line_number_and_position_2(LineNos, Lo, Mid, Offset,
+                LineNo, Pos)
+        )
+    else
+        % Lines are numbered from 1, so Lo preceding newlines puts Offset
+        % on line Lo + 1.
+        LineNo = 1 + Lo,
+        ( if Lo > 0 then
+            % The line starts just after the previous newline, so the
+            % character immediately following it is at position 1.
+            PrevNewlineOffset = LineNos ^ elem(Lo - 1),
+            Pos = Offset - PrevNewlineOffset
+        else
+            % First line: it starts at offset 0, which is position 1.
+            Pos = 1 + Offset
+        )
+    ).
+
+%-----------------------------------------------------------------------------%
+
current_offset(_Src, Offset, !PS) :-
Offset = !.PS.
@@ -291,11 +442,50 @@
%-----------------------------------------------------------------------------%
+    % Try P once.  If it succeeds with X, return yes(X) together with the
+    % states P produced; otherwise return no and pass both states through
+    % unchanged.
+    %
+optional(P, Src, Result, S0, S, PS0, PS) :-
+    ( if P(Src, X, S0, S1, PS0, PS1) then
+        Result = yes(X),
+        S = S1,
+        PS = PS1
+    else
+        Result = no,
+        S = S0,
+        PS = PS0,
+        semidet_true
+    ).
+
+%-----------------------------------------------------------------------------%
+
zero_or_more(P, Src, Result, !PS) :-
- ( if P(Src, X, !PS), zero_or_more(P, Src, Xs, !PS) then
- Result = [X | Xs]
+ zero_or_more_2(P, Src, [], RevResult, !PS),
+ Result = list.reverse(RevResult).
+
+
+ % We use an auxiliary predicate with an accumulator to make this tail
+ % recursive; without it, long sequences can lead to deep recursion.
+ %
+:- pred zero_or_more_2(parser(T)::in(parser), src::in,
+ list(T)::in, list(T)::out, ps::in, ps::out) is semidet.
+
+zero_or_more_2(P, Src, !RevResult, !PS) :-
+ ( if P(Src, X, !PS) then
+ list.cons(X, !RevResult),
+ zero_or_more_2(P, Src, !RevResult, !PS)
+ else
+ semidet_true
+ ).
+
+%-----------------------------------------------------------------------------%
+
+zero_or_more(P, Src, Result, !S, !PS) :-
+ zero_or_more_2(P, Src, [], RevResult, !S, !PS),
+ Result = list.reverse(RevResult).
+
+
+:- pred zero_or_more_2(parser_with_state(T, S)::in(parser_with_state), src::in,
+ list(T)::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
+
+zero_or_more_2(P, Src, !RevResult, !S, !PS) :-
+ ( if P(Src, X, !S, !PS) then
+ list.cons(X, !RevResult),
+ zero_or_more_2(P, Src, !RevResult, !S, !PS)
else
- Result = [],
semidet_true
).
@@ -308,6 +498,13 @@
%-----------------------------------------------------------------------------%
+    % Apply P once (failing if it fails), then collect any further results
+    % with zero_or_more, threading the user state and parser state through
+    % both steps.
+    %
+one_or_more(P, Src, Result, S0, S, PS0, PS) :-
+    P(Src, First, S0, S1, PS0, PS1),
+    zero_or_more(P, Src, Rest, S1, S, PS1, PS),
+    Result = [First | Rest].
+
+%-----------------------------------------------------------------------------%
+
brackets(L, R, P, Src, Result, !PS) :-
punct(L, Src, _, !PS),
P(Src, Result, !PS),
@@ -315,6 +512,13 @@
%-----------------------------------------------------------------------------%
+    % Match the opening punctuation L, then P, then the closing punctuation
+    % R.  Only P sees the user state; the parser state is threaded through
+    % all three steps.
+    %
+brackets(L, R, P, Src, Result, S0, S, PS0, PS) :-
+    punct(L, Src, _Open, PS0, PS1),
+    P(Src, Result, S0, S, PS1, PS2),
+    punct(R, Src, _Close, PS2, PS).
+
+%-----------------------------------------------------------------------------%
+
separated_list(Separator, P, Src, Result, !PS) :-
CommaP = ( pred(CommaPSrc::in, CommaPX::out, !.PS::in, !:PS::out)
is semidet :-
@@ -327,19 +531,37 @@
%-----------------------------------------------------------------------------%
+    % Parse one item with P, then zero or more repetitions of
+    % "Separator followed by P".  The first application of P is mandatory,
+    % so this fails if P fails at the starting position.
+    %
+separated_list(Separator, P, Src, Result, !S, !PS) :-
+    P(Src, First, !S, !PS),
+    SeparatorThenP =
+        ( pred(ItemSrc::in, Item::out, St0::in, St::out,
+                Pos0::in, Pos::out) is semidet :-
+            punct(Separator, ItemSrc, _, Pos0, Pos1),
+            P(ItemSrc, Item, St0, St, Pos1, Pos)
+        ),
+    zero_or_more(SeparatorThenP, Src, Rest, !S, !PS),
+    Result = [First | Rest].
+
+%-----------------------------------------------------------------------------%
+
comma_separated_list(P, Src, Result, !PS) :-
separated_list(",", P, Src, Result, !PS).
%-----------------------------------------------------------------------------%
+    % separated_list specialised to "," as the separator.
+    %
+comma_separated_list(P, Src, Result, S0, S, PS0, PS) :-
+    Separator = ",",
+    separated_list(Separator, P, Src, Result, S0, S, PS0, PS).
+
+%-----------------------------------------------------------------------------%
+
whitespace(Src, unit, !PS) :-
( if
next_char(Src, C, !PS),
char.is_whitespace(C)
then
- whitespace(Src, _, !PS)
+ skip_whitespace(Src, !PS)
else
- true
+ semidet_true
).
%-----------------------------------------------------------------------------%
@@ -352,14 +574,14 @@
punct(Punct, Src, unit, !PS) :-
match_string(Punct, Src, !PS),
- whitespace(Src, _, !PS).
+ skip_whitespace(Src, !PS).
%---------------------------------------------------------------------------%
keyword(IdChars, Keyword, Src, unit, !PS) :-
match_string(Keyword, Src, !PS),
not char_in_class(IdChars, Src, _, !.PS, _),
- whitespace(Src, _, !PS).
+ skip_whitespace(Src, !PS).
%-----------------------------------------------------------------------------%
@@ -376,7 +598,7 @@
true
),
current_offset(Src, EndPlusOne, !PS),
- whitespace(Src, _, !PS),
+ skip_whitespace(Src, !PS),
input_substring(Src, Start, EndPlusOne, FloatStr).
%-----------------------------------------------------------------------------%
@@ -396,7 +618,7 @@
digits(10, Src, _, !.PS, _)
),
current_offset(Src, EndPlusOne, !PS),
- whitespace(Src, _, !PS),
+ skip_whitespace(Src, !PS),
input_substring(Src, Start, EndPlusOne, IntStr).
%-----------------------------------------------------------------------------%
@@ -438,7 +660,7 @@
next_char(Src, QuoteChar, !PS),
string_literal_2(Src, QuoteChar, _, !PS),
current_offset(Src, EndPlusOne, !PS),
- whitespace(Src, _, !PS),
+ skip_whitespace(Src, !PS),
input_substring(Src, Start + 1, EndPlusOne - 1, String).
%-----------------------------------------------------------------------------%
@@ -464,7 +686,7 @@
char_in_class(InitIdChars, Src, _, !PS),
identifier_2(IdChars, Src, _, !PS),
current_offset(Src, EndPlusOne, !PS),
- whitespace(Src, _, !PS),
+ skip_whitespace(Src, !PS),
input_substring(Src, Start, EndPlusOne, Identifier).
%-----------------------------------------------------------------------------%
Index: tests/general/test_parsing_utils.exp
===================================================================
RCS file: /home/mercury1/repository/tests/general/test_parsing_utils.exp,v
retrieving revision 1.1
diff -u -r1.1 test_parsing_utils.exp
--- tests/general/test_parsing_utils.exp 28 Jan 2009 07:19:42 -0000 1.1
+++ tests/general/test_parsing_utils.exp 16 Jun 2009 05:09:43 -0000
@@ -213,3 +213,20 @@
pass: comma_separated_list(punct("!")) on "!, ! , ! abc"
returned [unit, unit, unit] as expected
[11 chars consumed]
+pass: optional(int_with_state) on "abc"
+ returned [] as expected
+ [0 chars consumed]
+pass: optional(int_with_state) on "1"
+ returned [1] as expected
+ [1 chars consumed]
+pass: zero_or_more(int_with_state) on "abc"
+ returned [] as expected
+ [0 chars consumed]
+pass: zero_or_more(int_with_state) on "1 2 3"
+ returned [3, 2, 1] as expected
+ [5 chars consumed]
+pass: one_or_more(int_with_state) on "abc"
+ failed as expected
+pass: one_or_more(int_with_state) on "1 2 3"
+ returned [3, 2, 1] as expected
+ [5 chars consumed]
Index: tests/general/test_parsing_utils.m
===================================================================
RCS file: /home/mercury1/repository/tests/general/test_parsing_utils.m,v
retrieving revision 1.1
diff -u -r1.1 test_parsing_utils.m
--- tests/general/test_parsing_utils.m 28 Jan 2009 07:19:42 -0000 1.1
+++ tests/general/test_parsing_utils.m 16 Jun 2009 05:08:16 -0000
@@ -295,6 +295,38 @@
stringify(comma_separated_list(punct("!"))),
"!, ! , ! abc", yes("[unit, unit, unit]")).
+test_case("optional(int_with_state)",
+ stringify_state(optional(int_with_state)),
+ "abc", yes("[]")).
+
+test_case("optional(int_with_state)",
+ stringify_state(optional(int_with_state)),
+ "1", yes("[1]")).
+
+test_case("zero_or_more(int_with_state)",
+ stringify_state(zero_or_more(int_with_state)),
+ "abc", yes("[]")).
+
+test_case("zero_or_more(int_with_state)",
+ stringify_state(zero_or_more(int_with_state)),
+ "1 2 3", yes("[3, 2, 1]")).
+
+test_case("one_or_more(int_with_state)",
+ stringify_state(one_or_more(int_with_state)),
+ "abc", no).
+
+test_case("one_or_more(int_with_state)",
+ stringify_state(one_or_more(int_with_state)),
+ "1 2 3", yes("[3, 2, 1]")).
+
+%-----------------------------------------------------------------------------%
+
+    % Parse an integer literal and push it onto the front of the list that
+    % serves as the user state.
+    %
+:- pred int_with_state(src::in, int::out, list(int)::in, list(int)::out,
+    ps::in, ps::out) is semidet.
+
+int_with_state(Src, Int, Seen0, Seen, !PS) :-
+    int_literal(Src, Int, !PS),
+    Seen = [Int | Seen0].
+
%-----------------------------------------------------------------------------%
:- pred stringify(
@@ -310,4 +342,19 @@
String = string.string(X).
%-----------------------------------------------------------------------------%
+
+    % Run the stateful parser P starting from an empty list as the user
+    % state, discard its result, and return the final state converted to
+    % a string.
+    %
+:- pred stringify_state(
+    pred(src, T, list(S), list(S), ps, ps)::
+        in(pred(in, out, in, out, in, out) is semidet),
+    src::in, string::out, ps::in, ps::out) is semidet.
+
+stringify_state(P, Src, String, PS0, PS) :-
+    InitialState = [],
+    P(Src, _Result, InitialState, FinalState, PS0, PS),
+    String = string.string(FinalState).
+
+%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to: mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions: mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------
More information about the reviews
mailing list