[m-dev.] Proposal: parsing module for the library.

Ralph Becket rafe at csse.unimelb.edu.au
Tue Jan 13 17:00:21 AEDT 2009


Here's a diff addressing most of the review comments:

--- parsing.m	2009-01-13 16:52:32.206161260 +1100
+++ parsing_utils.m	2009-01-13 16:59:03.720192432 +1100
@@ -1,14 +1,14 @@
 %-----------------------------------------------------------------------------%
-% parsing.m
+% parsing_utils.m
 % Ralph Becket <rafe at csse.unimelb.edu.au>
 % Tue Jan 13 11:32:49 EST 2009
 % vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
 %
-% Support for DCG style parsers.
+% Utilities for recursive descent parsers.
 %
 %-----------------------------------------------------------------------------%
 
-:- module parsing.
+:- module parsing_utils.
 
 :- interface.
 
@@ -30,16 +30,20 @@
     %
 :- type src.
 
+:- type parser(T) == pred(T, ps, ps).
+:- inst parser == ( pred(out, in, out) is semidet ).
+
     % Construct a new parser source and state from a string.
     %
 :- pred new_src_and_ps(string::in, src::out, ps::out) is det.
 
-:- type parser(T) == pred(T, ps, ps).
-:- inst parser == ( pred(out, in, out) is semidet ).
+    % Obtain the current offset from the start of the input string.
+    %
+:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
 
     % Read the next char.
     %
-:- pred char(src::in)
+:- pred next_char(src::in)
         : parser(char) `with_inst` parser.
 
     % Match a char from the given string.
@@ -82,36 +86,61 @@
 :- pred eof(src::in)
         : parser(unit) `with_inst` parser.
 
-    % Parse a float literal
+    % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-][0-9]+)?
     %
-:- pred float(src::in)
+:- pred float_literal(src::in)
         : parser(float) `with_inst` parser.
 
-    % Parse an int literal.
+    % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+.
     %
-:- pred int(src::in)
+:- pred int_literal(src::in)
         : parser(int) `with_inst` parser.
     
         % Parse an string literal.  The string argument is the quote character.
+        % A backslash (\) character in the string makes the next character
+        % literal (e.g., for embedding quotes).  These 'escaped' characters
+        % are included as-is in the result, along with the preceding backslash.
         %
-:- pred string(src::in, char::in)
+:- pred string_literal(src::in, char::in)
         : parser(string) `with_inst` parser.
 
+        % optional(Src, P) returns yes(X), if P succeeds returning X, or no
+        % if P does not succeed.
+        %
 :- pred optional(src::in, parser(T)::in(parser))
         : parser(maybe(T)) `with_inst` parser.
 
+        % zero_or_more(Src, P, Xs) returns the list of results Xs obtained
+        % by repeatedly applying P until P fails.  The nth item in Xs is
+        % the result from the nth application of P.
+        %
 :- pred zero_or_more(src::in, parser(T)::in(parser))
         : parser(list(T)) `with_inst` parser.
 
+        % one_or_more(Src, P, Xs) returns the list of results Xs obtained
+        % by repeatedly applying P until P fails.  The nth item in Xs is
+        % the result from the nth application of P.  P must succeed at
+        % least once.
+        %
 :- pred one_or_more(src::in, parser(T)::in(parser))
         : parser(list(T)) `with_inst` parser.
 
+        % brackets(Src, L, R, P, X) is equivalent to
+        %   punct(Src, L, _), P(Src, X), punct(Src, R, _).
+        %
 :- pred brackets(src::in, string::in, string::in, parser(T)::in(parser))
         : parser(T) `with_inst` parser.
 
+        % separated_list(Src, Separator, P, Xs) is like
+        % zero_or_more(Src, P, Xs) except that successive applications of
+        % P must be separated by punct(Src, Separator, _).
+        %
 :- pred separated_list(src::in, string::in, parser(T)::in(parser))
         : parser(list(T)) `with_inst` parser.
 
+        % comma_separated_list(Src, P, Xs) is the same as
+        %   separated_list(Src, ",", P, Xs).
+        %
 :- pred comma_separated_list(src::in, parser(T)::in(parser))
         : parser(list(T)) `with_inst` parser.
 
@@ -144,33 +173,34 @@
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
 
-:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
-
-current_offset(_Src, PS, PS, PS).
+current_offset(_Src, Offset, !PS) :-
+    Offset = !.PS.
 
 %-----------------------------------------------------------------------------%
 
 eof(Src, unit, !PS) :-
-    !.PS = Src ^ input_length.
+    current_offset(Src, Offset, !PS),
+    Offset = Src ^ input_length.
 
 %-----------------------------------------------------------------------------%
 
-char(Src, Char, PS, PS + 1) :-
-    PS < Src ^ input_length,
-    Char = Src ^ input_string ^ unsafe_elem(PS).
+next_char(Src, Char, !PS) :-
+    current_offset(Src, Offset, !PS),
+    Offset < Src ^ input_length,
+    Char = Src ^ input_string ^ unsafe_elem(Offset),
+    !:PS = !.PS + 1.
 
 %-----------------------------------------------------------------------------%
 
-char_in_class(Src, CharClass, Char) -->
-    char(Src, Char),
-    { string.contains_char(CharClass, Char) }.
+char_in_class(Src, CharClass, Char, !PS) :-
+    next_char(Src, Char, !PS),
+    string.contains_char(CharClass, Char).
 
 %-----------------------------------------------------------------------------%
 
-:- pred input_substring(src::in, int::in, int::in)
-        : parser(string) `with_inst` parser.
+:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
 
-input_substring(Src, Start, EndPlusOne, Substring, !PS) :-
+input_substring(Src, Start, EndPlusOne, Substring) :-
     EndPlusOne =< Src ^ input_length,
     Substring =
         unsafe_substring(Src ^ input_string, Start, EndPlusOne - Start).
@@ -202,173 +232,178 @@
 %-----------------------------------------------------------------------------%
 %-----------------------------------------------------------------------------%
 
-optional(_Src, P, Result) -->
-    ( if P(X) then
-        { Result = yes(X) }
+optional(_Src, P, Result, !PS) :-
+    ( if P(X, !PS) then
+        Result = yes(X)
       else
-        { Result = no },
-        { semidet_true }
+        Result = no,
+        semidet_true
     ).
 
 %-----------------------------------------------------------------------------%
 
-zero_or_more(Src, P, Result) -->
-    ( if P(X), zero_or_more(Src, P, Xs) then
-        { Result = [X | Xs] }
+zero_or_more(Src, P, Result, !PS) :-
+    ( if P(X, !PS), zero_or_more(Src, P, Xs, !PS) then
+        Result = [X | Xs]
       else
-        { Result = [] },
-        { semidet_true }
+        Result = [],
+        semidet_true
     ).
 
 %-----------------------------------------------------------------------------%
 
-one_or_more(Src, P, Result) -->
-    P(X),
-    zero_or_more(Src, P, Xs),
-    { Result = [X | Xs] }.
+one_or_more(Src, P, Result, !PS) :-
+    P(X, !PS),
+    zero_or_more(Src, P, Xs, !PS),
+    Result = [X | Xs].
 
 %-----------------------------------------------------------------------------%
 
-brackets(Src, L, R, P, Result) -->
-    punct(Src, L, _),
-    P(Result),
-    punct(Src, R, _).
+brackets(Src, L, R, P, Result, !PS) :-
+    punct(Src, L, _, !PS),
+    P(Result, !PS),
+    punct(Src, R, _, !PS).
 
 %-----------------------------------------------------------------------------%
 
-separated_list(Src, Separator, P, Result) -->
-    { CommaP = ( pred(CommaPX::out, !.PS::in, !:PS::out) is semidet :-
+separated_list(Src, Separator, P, Result, !PS) :-
+    CommaP = ( pred(CommaPX::out, !.PS::in, !:PS::out) is semidet :-
         punct(Src, Separator, _, !PS),
         P(CommaPX, !PS)
-    ) },
-    P(X),
-    zero_or_more(Src, CommaP, Xs),
-    { Result = [X | Xs] }.
+    ),
+    P(X, !PS),
+    zero_or_more(Src, CommaP, Xs, !PS),
+    Result = [X | Xs].
 
 %-----------------------------------------------------------------------------%
 
-comma_separated_list(Src, P, Result) -->
-    separated_list(Src, ",", P, Result).
+comma_separated_list(Src, P, Result, !PS) :-
+    separated_list(Src, ",", P, Result, !PS).
 
 %-----------------------------------------------------------------------------%
 
-whitespace(Src, unit) -->
-    ( if char(Src, C), { char.is_whitespace(C) } then
-        whitespace(Src, _)
+whitespace(Src, unit, !PS) :-
+    ( if
+        next_char(Src, C, !PS),
+        char.is_whitespace(C)
+      then
+        whitespace(Src, _, !PS)
       else
-        []
+        true
     ).
 
 %-----------------------------------------------------------------------------%
 
-skip_to_eol(Src, unit) -->
-    char(Src, C),
-    ( if { C = ('\n') } then [] else skip_to_eol(Src, _) ).
+skip_to_eol(Src, unit, !PS) :-
+    next_char(Src, C, !PS),
+    ( if C = ('\n') then true else skip_to_eol(Src, _, !PS) ).
 
 %-----------------------------------------------------------------------------%
 
-punct(Src, Punct, unit) -->
-    match_string(Src, Punct),
-    whitespace(Src, _).
+punct(Src, Punct, unit, !PS) :-
+    match_string(Src, Punct, !PS),
+    whitespace(Src, _, !PS).
 
 %-----------------------------------------------------------------------------%
 
-keyword(Src, IdChars, Keyword, unit) -->
-    match_string(Src, Keyword),
-    not char_in_class(Src, IdChars, _),
-    whitespace(Src, _).
+keyword(Src, IdChars, Keyword, unit, !PS) :-
+    match_string(Src, Keyword, !PS),
+    not char_in_class(Src, IdChars, _, !.PS, _),
+    whitespace(Src, _, !PS).
 
 %-----------------------------------------------------------------------------%
 
-float(Src, Float) -->
-    current_offset(Src, Start),
-    ( if char(Src, ('-')) then [] else [] ),
-    digits(Src, 10, _),
-    char(Src, ('.')),
-    digits(Src, 10, _),
-    ( if char_in_class(Src, "eE", _) then
-        ( if char(Src, ('-')) then [] else [] ),
-        digits(Src, 10, _)
+float_literal(Src, Float, !PS) :-
+    current_offset(Src, Start, !PS),
+    ( if next_char(Src, ('-'), !PS) then true else true ),
+    digits(Src, 10, _, !PS),
+    next_char(Src, ('.'), !PS),
+    digits(Src, 10, _, !PS),
+    ( if char_in_class(Src, "eE", _, !PS) then
+        ( if next_char(Src, ('-'), !PS) then true else true ),
+        digits(Src, 10, _, !PS)
       else
-        []
+        true
     ),
-    current_offset(Src, EndPlusOne),
-    whitespace(Src, _),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
     input_substring(Src, Start, EndPlusOne, SubString),
-    { Float = string.det_to_float(SubString) }.
+    Float = string.det_to_float(SubString).
 
 %-----------------------------------------------------------------------------%
 
-int(Src, Int) -->
-    current_offset(Src, Start),
-    optional(Src, char_in_class(Src, "-"), _),
-    digits(Src, 10, _),
+int_literal(Src, Int, !PS) :-
+    current_offset(Src, Start, !PS),
+    optional(Src, char_in_class(Src, "-"), _, !PS),
+    digits(Src, 10, _, !PS),
     not (
-        char(Src, ('.')),
-        digits(Src, 10, _)
+        next_char(Src, ('.'), !PS),
+        digits(Src, 10, _, !.PS, _)
     ),
-    current_offset(Src, EndPlusOne),
-    whitespace(Src, _),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
     input_substring(Src, Start, EndPlusOne, SubString),
-    { Int = string.det_to_int(SubString) }.
+    Int = string.det_to_int(SubString).
 
 %-----------------------------------------------------------------------------%
 
 :- pred digits(src::in, int::in)
         : parser(unit) `with_inst` parser.
 
-digits(Src, Base, unit) -->
-    char(Src, C),
-    { char.digit_to_int(C, D), D < Base },
-    digits_2(Src, Base, _).
+digits(Src, Base, unit, !PS) :-
+    next_char(Src, C, !PS),
+    char.digit_to_int(C, D),
+    D < Base,
+    digits_2(Src, Base, _, !PS).
 
 
 :- pred digits_2(src::in, int::in)
         : parser(unit) `with_inst` parser.
 
-digits_2(Src, Base, unit) -->
+digits_2(Src, Base, unit, !PS) :-
     ( if
-        char(Src, C),
-        { char.digit_to_int(C, D), D < Base }
+        next_char(Src, C, !PS),
+        char.digit_to_int(C, D),
+        D < Base
       then
-        digits_2(Src, Base, _)
+        digits_2(Src, Base, _, !PS)
       else
-        []
+        true
     ).
 
 %-----------------------------------------------------------------------------%
 
-string(Src, QuoteChar, String) -->
-    current_offset(Src, Start),
-    char(Src, QuoteChar),
-    string_2(Src, QuoteChar, _),
-    current_offset(Src, EndPlusOne),
-    whitespace(Src, _),
+string_literal(Src, QuoteChar, String, !PS) :-
+    current_offset(Src, Start, !PS),
+    next_char(Src, QuoteChar, !PS),
+    string_literal_2(Src, QuoteChar, _, !PS),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
     input_substring(Src, Start + 1, EndPlusOne - 1, String).
 
 %-----------------------------------------------------------------------------%
 
-:- pred string_2(src::in, char::in)
+:- pred string_literal_2(src::in, char::in)
         : parser(unit) `with_inst` parser.
 
-string_2(Src, QuoteChar, unit) -->
-    char(Src, C),
-    ( if { C = QuoteChar } then
-        []
-      else if { C = ('\\') } then
-        char(Src, _),
-        string_2(Src, QuoteChar, _)
+string_literal_2(Src, QuoteChar, unit, !PS) :-
+    next_char(Src, C, !PS),
+    ( if C = QuoteChar then
+        true
+      else if C = ('\\') then
+        next_char(Src, _, !PS),
+        string_literal_2(Src, QuoteChar, _, !PS)
       else
-        string_2(Src, QuoteChar, _)
+        string_literal_2(Src, QuoteChar, _, !PS)
     ).
 
 %-----------------------------------------------------------------------------%
 
-identifier(Src, InitIdChars, IdChars, Identifier) -->
-    current_offset(Src, Start),
-    char_in_class(Src, InitIdChars, _),
-    identifier_2(Src, IdChars, _),
-    current_offset(Src, EndPlusOne),
+identifier(Src, InitIdChars, IdChars, Identifier, !PS) :-
+    current_offset(Src, Start, !PS),
+    char_in_class(Src, InitIdChars, _, !PS),
+    identifier_2(Src, IdChars, _, !PS),
+    current_offset(Src, EndPlusOne, !PS),
     input_substring(Src, Start, EndPlusOne, Identifier).
 
 %-----------------------------------------------------------------------------%
@@ -376,11 +411,11 @@
 :- pred identifier_2(src::in, string::in)
         : parser(unit) `with_inst` parser.
 
-identifier_2(Src, IdChars, unit) -->
-    ( if char_in_class(Src, IdChars, _) then
-        identifier_2(Src, IdChars, _)
+identifier_2(Src, IdChars, unit, !PS) :-
+    ( if char_in_class(Src, IdChars, _, !PS) then
+        identifier_2(Src, IdChars, _, !PS)
       else
-        []
+        true
     ).
 
 %-----------------------------------------------------------------------------%
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to:       mercury-developers at csse.unimelb.edu.au
Administrative Queries: owner-mercury-developers at csse.unimelb.edu.au
Subscriptions:          mercury-developers-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the developers mailing list