[m-rev.] For review: parsing_utils.m

Ralph Becket rafe at csse.unimelb.edu.au
Wed Jan 28 11:44:07 AEDT 2009


Add parsing_utils.m to the library, providing support for recursive descent
parsers.

NEWS:
	Report the addition of parsing_utils.m to the library.

library/library.m:
	Include parsing_utils.m.

library/parsing_utils.m:
	Added.

library/string.m:
	Make string.to_int fail on overflow.  Amend comments to reflect this.

tests/general/Mmakefile:
tests/general/test_parsing_utils.exp:
tests/general/test_parsing_utils.m:
	Test case for parsing_utils.m.

Index: NEWS
===================================================================
RCS file: /home/mercury1/repository/mercury/NEWS,v
retrieving revision 1.499
diff -u -r1.499 NEWS
--- NEWS	12 Jan 2009 02:28:45 -0000	1.499
+++ NEWS	28 Jan 2009 00:14:17 -0000
@@ -30,6 +30,9 @@
 
 Changes to the Mercury standard library:
 
+* A new module, parsing_utils, has been added to provide support for
+  implementing recursive descent parsers.
+
 * We have added extra modes to many of the fold style predicates in the
   library in order to better support (mostly-)unique accumulators.
 
Index: library/library.m
===================================================================
RCS file: /home/mercury1/repository/mercury/library/library.m,v
retrieving revision 1.120
diff -u -r1.120 library.m
--- library/library.m	9 Mar 2008 09:39:24 -0000	1.120
+++ library/library.m	27 Jan 2009 23:44:31 -0000
@@ -89,6 +89,7 @@
 :- import_module ops.
 :- import_module pair.
 :- import_module parser.
+:- import_module parsing_utils.
 :- import_module pprint.
 :- import_module pqueue.
 :- import_module pretty_printer.
Index: library/parsing_utils.m
===================================================================
RCS file: library/parsing_utils.m
diff -N library/parsing_utils.m
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ library/parsing_utils.m	28 Jan 2009 00:16:17 -0000
@@ -0,0 +1,483 @@
+%---------------------------------------------------------------------------%
+% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
+%---------------------------------------------------------------------------%
+% Copyright (C) 2009 The University of Melbourne.
+% This file may only be copied under the terms of the GNU Library General
+% Public License - see the file COPYING.LIB in the Mercury distribution.
+%---------------------------------------------------------------------------%
+% 
+% File: parsing_utils.m
+% Author: Ralph Becket <rafe at csse.unimelb.edu.au>
+% Stability: low
+%
+% Utilities for recursive descent parsers.  Parsers take at least three
+% arguments: a source (src) containing the input string and a parser
+% state (ps) input/output pair tracking the current offset into the input.
+%
+% A new src and ps can be constructed by calling
+% new_src_and_ps(InputString, Src, !:PS).  Parsing predicates are semidet
+% and typically take the form p(...input arguments..., Src, Result, !PS).
+% A parser matching variable assignments of the form `x = 42' might be
+% defined like this:
+%
+%   var_assignment(Src, {Var, Value}, !PS) :-
+%       var(Src, Var, !PS),
+%       punct(Src, "=", !PS),
+%       expr(Src, Expr, !PS).
+%
+% where var/4 and expr/4 are parsers for variables and expressions
+% respectively and punct/4 is provided by this module for matching
+% punctuation.
+%
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+:- module parsing_utils.
+
+:- interface.
+
+:- import_module char.
+:- import_module float.
+:- import_module int.
+:- import_module list.
+:- import_module maybe.
+:- import_module string.
+:- import_module unit.
+
+
+
+    % The parser source (input string).
+    %
+:- type src.
+
+    % The parser "state", passed around in DCG arguments.
+    % 
+:- type ps.
+
+    % This type and inst are useful for specifying "standard" parser
+    % signatures.
+    %
+:- type parser(T) == pred(src, T, ps, ps).
+:- inst parser == ( pred(in, out, in, out) is semidet ).
+
+    % Construct a new parser source and state from a string.
+    %
+:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
+
+    % Obtain the current offset from the start of the input string
+    % (the first character in the input has offset 0).
+    %
+:- pred current_offset(src::in, int::out,
+        ps::in, ps::out) is det.
+
+    % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
+    % Copy the substring from the input occupying the offsets
+    % [StartOffset, EndOffsetPlusOne).
+    %
+:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
+
+    % Read the next char.
+    %
+:- pred next_char(src::in, char::out,
+        ps::in, ps::out) is semidet.
+
+    % Match a char from the given string.
+    %
+:- pred char_in_class(string::in, src::in, char::out,
+        ps::in, ps::out) is semidet.
+
+    % Match a string exactly and any subsequent whitespace.
+    %
+:- pred punct(string::in, src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+    % keyword(Src, IdChars, Keyword, _, !PS) matches Keyword exactly (i.e., it
+    % must not be followed by any character in IdChars) and any subsequent
+    % whitespace.
+    %
+:- pred keyword(string::in, string::in, src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+    % identifier(Src, InitIdChars, IdChars, Identifier, !PS) matches the next
+    % identifer (result in Identifier) comprising a char from InitIdChars
+    % followed by zero or more chars from IdChars.  Any subsequent whitespace
+    % is consumed.
+    %
+:- pred identifier(string::in, string::in, src::in, string::out,
+        ps::in, ps::out) is semidet.
+
+    % Consume any whitespace.
+    %
+:- pred whitespace(src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+    % Consume any input up to, and including, the next newline character
+    % marking the end of the current line.
+    %
+:- pred skip_to_eol(src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+    % Succeed if we have reached the end of the input.
+    %
+:- pred eof(src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+    % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-][0-9]+)?
+    % followed by any whitespace.  The float_literal_as_string version simply
+    % returns the matched string.  The float_literal version uses
+    % string.to_float to convert the output of float_literal_as_string; this
+    % may return an approximate answer since not all floating point numbers
+    % can be perfectly represented as Mercury floats.
+    %
+:- pred float_literal_as_string(src::in, string::out,
+        ps::in, ps::out) is semidet.
+:- pred float_literal(src::in, float::out,
+        ps::in, ps::out) is semidet.
+
+    % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+,
+    % followed by any whitespace.  The int_literal_as_string version simply
+    % returns the matched string.  The int_literal version uses string.to_int
+    % to convert the output of int_literal_as_string; this may fail if the
+    % number in question cannot be represented as a Mercury int.
+    %
+:- pred int_literal_as_string(src::in, string::out,
+        ps::in, ps::out) is semidet.
+:- pred int_literal(src::in, int::out,
+        ps::in, ps::out) is semidet.
+    
+    % Parse an string literal.  The string argument is the quote character.
+    % A backslash (\) character in the string makes the next character
+    % literal (e.g., for embedding quotes).  These 'escaped' characters
+    % are included as-is in the result, along with the preceding backslash.
+    % Any following whitespace is also consumed.
+    %
+:- pred string_literal(char::in, src::in, string::out,
+        ps::in, ps::out) is semidet.
+
+    % optional(P, Src, Result, !PS) returns Result = yes(X), if P(Src, X, !PS),
+    % or Result = no if P does not succeed.
+    %
+:- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % zero_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.
+    %
+:- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % one_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
+    % by repeatedly applying P until P fails.  The nth item in Xs is
+    % the result from the nth application of P.  P must succeed at
+    % least once.
+    %
+:- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
+        ps::in, ps::out) is semidet.
+
+    % brackets(L, R, P, Src, X, !PS) is equivalent to
+    %   punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
+    %
+:- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
+        T::out, ps::in, ps::out) is semidet.
+
+    % separated_list(Separator, P, Src, Xs, !PS) is like
+    % zero_or_more(P, Src, Xs, !PS) except that successive applications of
+    % P must be separated by punct(Separator, Src, _, !PS).
+    %
+:- pred separated_list(string::in, parser(T)::in(parser), src::in, 
+        list(T)::out, ps::in, ps::out) is semidet.
+
+    % comma_separated_list(P, Src, Xs) is the same as
+    %   separated_list(",", P, Src, Xs).
+    %
+:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
+        ps::in, ps::out) is semidet.
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+:- implementation.
+
+
+
+    % The parser "state" is just the offset into the input string.
+    %
+:- type ps == int.
+
+:- type src
+    --->    src(
+                input_length    ::  int,
+                input_string    ::  string
+            ).
+
+%-----------------------------------------------------------------------------%
+
+new_src_and_ps(InputString, Src, PS) :-
+    Src = src(string.length(InputString), InputString),
+    PS = 0.
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+% Low-level predicates.
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+current_offset(_Src, Offset, !PS) :-
+    Offset = !.PS.
+
+%-----------------------------------------------------------------------------%
+
+eof(Src, unit, !PS) :-
+    current_offset(Src, Offset, !PS),
+    Offset = Src ^ input_length.
+
+%-----------------------------------------------------------------------------%
+
+next_char(Src, Char, !PS) :-
+    current_offset(Src, Offset, !PS),
+    Offset < Src ^ input_length,
+    Char = Src ^ input_string ^ unsafe_elem(Offset),
+    !:PS = !.PS + 1.
+
+%-----------------------------------------------------------------------------%
+
+char_in_class(CharClass, Src, Char, !PS) :-
+    next_char(Src, Char, !PS),
+    string.contains_char(CharClass, Char).
+
+%-----------------------------------------------------------------------------%
+
+input_substring(Src, Start, EndPlusOne, Substring) :-
+    EndPlusOne =< Src ^ input_length,
+    Substring =
+        unsafe_substring(Src ^ input_string, Start, EndPlusOne - Start).
+
+%-----------------------------------------------------------------------------%
+
+:- pred match_string(string::in, src::in,
+        ps::in, ps::out) is semidet.
+
+match_string(MatchStr, Src, PS, PS + N) :-
+    N = string.length(MatchStr),
+    PS + N =< Src ^ input_length,
+    match_string_2(N, 0, MatchStr, PS, Src ^ input_string).
+
+
+:- pred match_string_2(int::in, int::in, string::in, int::in, string::in)
+        is semidet.
+
+match_string_2(N, I, MatchStr, Offset, Str) :-
+    ( if I < N then
+        MatchStr ^ unsafe_elem(I) = Str ^ unsafe_elem(Offset + I),
+        match_string_2(N, I + 1, MatchStr, Offset, Str)
+      else
+        true
+    ).
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+% Utility predicates.
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+optional(P, Src, Result, !PS) :-
+    ( if P(Src, X, !PS) then
+        Result = yes(X)
+      else
+        Result = no,
+        semidet_true
+    ).
+
+%-----------------------------------------------------------------------------%
+
+zero_or_more(P, Src, Result, !PS) :-
+    ( if P(Src, X, !PS), zero_or_more(P, Src, Xs, !PS) then
+        Result = [X | Xs]
+      else
+        Result = [],
+        semidet_true
+    ).
+
+%-----------------------------------------------------------------------------%
+
+one_or_more(P, Src, Result, !PS) :-
+    P(Src, X, !PS),
+    zero_or_more(P, Src, Xs, !PS),
+    Result = [X | Xs].
+
+%-----------------------------------------------------------------------------%
+
+brackets(L, R, P, Src, Result, !PS) :-
+    punct(L, Src, _, !PS),
+    P(Src, Result, !PS),
+    punct(R, Src, _, !PS).
+
+%-----------------------------------------------------------------------------%
+
+separated_list(Separator, P, Src, Result, !PS) :-
+    CommaP = ( pred(CommaPSrc::in, CommaPX::out, !.PS::in, !:PS::out)
+            is semidet :-
+        punct(Separator, CommaPSrc, _, !PS),
+        P(CommaPSrc, CommaPX, !PS)
+    ),
+    P(Src, X, !PS),
+    zero_or_more(CommaP, Src, Xs, !PS),
+    Result = [X | Xs].
+
+%-----------------------------------------------------------------------------%
+
+comma_separated_list(P, Src, Result, !PS) :-
+    separated_list(",", P, Src, Result, !PS).
+
+%-----------------------------------------------------------------------------%
+
+whitespace(Src, unit, !PS) :-
+    ( if
+        next_char(Src, C, !PS),
+        char.is_whitespace(C)
+      then
+        whitespace(Src, _, !PS)
+      else
+        true
+    ).
+
+%-----------------------------------------------------------------------------%
+
+skip_to_eol(Src, unit, !PS) :-
+    next_char(Src, C, !PS),
+    ( if C = ('\n') then true else skip_to_eol(Src, _, !PS) ).
+
+%-----------------------------------------------------------------------------%
+
+punct(Punct, Src, unit, !PS) :-
+    match_string(Punct, Src, !PS),
+    whitespace(Src, _, !PS).
+
+%---------------------------------------------------------------------------%
+
+keyword(IdChars, Keyword, Src, unit, !PS) :-
+    match_string(Keyword, Src, !PS),
+    not char_in_class(IdChars, Src, _, !.PS, _),
+    whitespace(Src, _, !PS).
+
+%-----------------------------------------------------------------------------%
+
+float_literal_as_string(Src, FloatStr, !PS) :-
+    current_offset(Src, Start, !PS),
+    ( if next_char(Src, ('-'), !PS) then true else true ),
+    digits(10, Src, _, !PS),
+    next_char(Src, ('.'), !PS),
+    digits(10, Src, _, !PS),
+    ( if char_in_class("eE", Src, _, !PS) then
+        ( if next_char(Src, ('-'), !PS) then true else true ),
+        digits(10, Src, _, !PS)
+      else
+        true
+    ),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
+    input_substring(Src, Start, EndPlusOne, FloatStr).
+
+%-----------------------------------------------------------------------------%
+
+float_literal(Src, Float, !PS) :-
+    float_literal_as_string(Src, FloatStr, !PS),
+    string.to_float(FloatStr, Float).
+
+%-----------------------------------------------------------------------------%
+
+int_literal_as_string(Src, IntStr, !PS) :-
+    current_offset(Src, Start, !PS),
+    optional(char_in_class("-"), Src, _, !PS),
+    digits(10, Src, _, !PS),
+    not (
+        next_char(Src, ('.'), !PS),
+        digits(10, Src, _, !.PS, _)
+    ),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
+    input_substring(Src, Start, EndPlusOne, IntStr).
+
+%-----------------------------------------------------------------------------%
+
+int_literal(Src, Int, !PS) :-
+    int_literal_as_string(Src, IntStr, !PS),
+    string.to_int(IntStr, Int).
+
+%-----------------------------------------------------------------------------%
+
+:- pred digits(int::in, src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+digits(Base, Src, unit, !PS) :-
+    next_char(Src, C, !PS),
+    char.digit_to_int(C, D),
+    D < Base,
+    digits_2(Base, Src, _, !PS).
+
+
+:- pred digits_2(int::in, src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+digits_2(Base, Src, unit, !PS) :-
+    ( if
+        next_char(Src, C, !PS),
+        char.digit_to_int(C, D),
+        D < Base
+      then
+        digits_2(Base, Src, _, !PS)
+      else
+        true
+    ).
+
+%-----------------------------------------------------------------------------%
+
+string_literal(QuoteChar, Src, String, !PS) :-
+    current_offset(Src, Start, !PS),
+    next_char(Src, QuoteChar, !PS),
+    string_literal_2(Src, QuoteChar, _, !PS),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
+    input_substring(Src, Start + 1, EndPlusOne - 1, String).
+
+%-----------------------------------------------------------------------------%
+
+:- pred string_literal_2(src::in, char::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+string_literal_2(Src, QuoteChar, unit, !PS) :-
+    next_char(Src, C, !PS),
+    ( if C = QuoteChar then
+        true
+      else if C = ('\\') then
+        next_char(Src, _, !PS),
+        string_literal_2(Src, QuoteChar, _, !PS)
+      else
+        string_literal_2(Src, QuoteChar, _, !PS)
+    ).
+
+%-----------------------------------------------------------------------------%
+
+identifier(InitIdChars, IdChars, Src, Identifier, !PS) :-
+    current_offset(Src, Start, !PS),
+    char_in_class(InitIdChars, Src, _, !PS),
+    identifier_2(IdChars, Src, _, !PS),
+    current_offset(Src, EndPlusOne, !PS),
+    whitespace(Src, _, !PS),
+    input_substring(Src, Start, EndPlusOne, Identifier).
+
+%-----------------------------------------------------------------------------%
+
+:- pred identifier_2(string::in, src::in, unit::out,
+        ps::in, ps::out) is semidet.
+
+identifier_2(IdChars, Src, unit, !PS) :-
+    ( if char_in_class(IdChars, Src, _, !PS) then
+        identifier_2(IdChars, Src, _, !PS)
+      else
+        true
+    ).
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
Index: library/string.m
===================================================================
RCS file: /home/mercury1/repository/mercury/library/string.m,v
retrieving revision 1.276
diff -u -r1.276 string.m
--- library/string.m	3 Sep 2008 04:39:16 -0000	1.276
+++ library/string.m	28 Jan 2009 00:32:37 -0000
@@ -306,6 +306,7 @@
 
     % As above, but fail instead of throwing an exception if the
     % list contains a null character.
+    %
 :- pred string.semidet_from_char_list(list(char)::in, string::uo) is semidet.
 
     % Same as string.from_char_list, except that it reverses the order
@@ -323,12 +324,14 @@
 
     % Converts a signed base 10 string to an int; throws an exception
     % if the string argument does not match the regexp [+-]?[0-9]+
+    % or the number is not in the range [int.min_int+1, int.max_int].
     %
 :- func string.det_to_int(string) = int.
 
     % Convert a string to an int. The string must contain only digits,
     % optionally preceded by a plus or minus sign. If the string does
-    % not match this syntax, string.to_int fails.
+    % not match this syntax or the number is not in the range
+    % [int.min_int+1, int.max_int], string.to_int fails.
     %
 :- pred string.to_int(string::in, int::out) is semidet.
 
@@ -336,13 +339,15 @@
     % must contain one or more digits in the specified base, optionally
     % preceded by a plus or minus sign. For bases > 10, digits 10 to 35
     % are represented by the letters A-Z or a-z. If the string does not match
-    % this syntax, the predicate fails.
+    % this syntax or the number is not in the range
+    % [int.min_int+1, int.max_int], the predicate fails.
     %
 :- pred string.base_string_to_int(int::in, string::in, int::out) is semidet.
 
     % Converts a signed base N string to an int; throws an exception
     % if the string argument is not precisely an optional sign followed by
-    % a non-empty string of base N digits.
+    % a non-empty string of base N digits and the number is in the range
+    % [int.min_int+1, int.max_int].
     %
 :- func string.det_base_string_to_int(int, string) = int.
 
@@ -944,9 +949,11 @@
 
 :- pred accumulate_int(int::in, char::in, int::in, int::out) is semidet.
 
-accumulate_int(Base, Char, N, (Base * N) + M) :-
+accumulate_int(Base, Char, N0, N) :-
     char.digit_to_int(Char, M),
-    M < Base.
+    M < Base,
+    N = (Base * N0) + M,
+    N0 =< N.                             % Fail on overflow.
 
 % It is important to inline string.index and string.index_det.
 % so that the compiler can do loop invariant hoisting
Index: tests/general/Mmakefile
===================================================================
RCS file: /home/mercury1/repository/tests/general/Mmakefile,v
retrieving revision 1.56
diff -u -r1.56 Mmakefile
--- tests/general/Mmakefile	1 Dec 2008 00:32:54 -0000	1.56
+++ tests/general/Mmakefile	27 Jan 2009 23:41:10 -0000
@@ -67,6 +67,7 @@
 		string_test \
 		string_test_2 \
 		string_to_float \
+		test_parsing_utils \
 		test_univ \
 		unreachable \
 		unsafe_uniqueness
Index: tests/general/test_parsing_utils.exp
===================================================================
RCS file: tests/general/test_parsing_utils.exp
diff -N tests/general/test_parsing_utils.exp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/general/test_parsing_utils.exp	27 Jan 2009 23:40:38 -0000
@@ -0,0 +1,216 @@
+pass: next_char on ""
+	failed as expected
+pass: next_char on "123"
+	returned '1' as expected
+	[1 chars consumed]
+pass: char_in_class("123") on ""
+	failed as expected
+pass: char_in_class("123") on "abc"
+	failed as expected
+pass: char_in_class("123") on "123"
+	returned '1' as expected
+	[1 chars consumed]
+pass: punct("!") on ""
+	failed as expected
+pass: punct("!") on "abc"
+	failed as expected
+pass: punct("!") on "*"
+	failed as expected
+pass: punct("!") on "!"
+	returned unit as expected
+	[1 chars consumed]
+pass: keyword("ABC", "ABC") on ""
+	failed as expected
+pass: keyword("ABC", "ABC") on "123"
+	failed as expected
+pass: keyword("ABC", "ABC") on "ABCA"
+	failed as expected
+pass: keyword("ABC", "ABC") on "ABC 123"
+	returned unit as expected
+	[4 chars consumed]
+pass: identifier("ABC", "ABCabc_") on ""
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "abc"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "_"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "A"
+	returned "A" as expected
+	[1 chars consumed]
+pass: identifier("ABC", "ABCabc_") on "Ab_c"
+	returned "Ab_c" as expected
+	[4 chars consumed]
+pass: identifier("ABC", "ABCabc_") on "*"
+	failed as expected
+pass: identifier("ABC", "ABCabc_") on "Abc !"
+	returned "Abc" as expected
+	[4 chars consumed]
+pass: whitespace on ""
+	returned unit as expected
+	[0 chars consumed]
+pass: whitespace on "123"
+	returned unit as expected
+	[0 chars consumed]
+pass: whitespace on "   "
+	returned unit as expected
+	[3 chars consumed]
+pass: whitespace on "   123"
+	returned unit as expected
+	[3 chars consumed]
+pass: skip_to_eol on ""
+	failed as expected
+pass: skip_to_eol on "blah blah
+"
+	returned unit as expected
+	[10 chars consumed]
+pass: skip_to_eol on "blah blah
+123"
+	returned unit as expected
+	[10 chars consumed]
+pass: eof on "123"
+	failed as expected
+pass: eof on ""
+	returned unit as expected
+	[0 chars consumed]
+pass: float_literal_as_string on ""
+	failed as expected
+pass: float_literal_as_string on "abc"
+	failed as expected
+pass: float_literal_as_string on "123"
+	failed as expected
+pass: float_literal_as_string on "123.0   abc"
+	returned "123.0" as expected
+	[8 chars consumed]
+pass: float_literal_as_string on "123.0e1   abc"
+	returned "123.0e1" as expected
+	[10 chars consumed]
+pass: float_literal_as_string on "-123.0   abc"
+	returned "-123.0" as expected
+	[9 chars consumed]
+pass: float_literal_as_string on "-123.0e1   abc"
+	returned "-123.0e1" as expected
+	[11 chars consumed]
+pass: float_literal_as_string on "-123.0E-1   abc"
+	returned "-123.0E-1" as expected
+	[12 chars consumed]
+pass: float_literal on ""
+	failed as expected
+pass: float_literal on "abc"
+	failed as expected
+pass: float_literal on "123"
+	failed as expected
+pass: float_literal on "123.0   abc"
+	returned 123.0 as expected
+	[8 chars consumed]
+pass: float_literal on "123.0e1   abc"
+	returned 1230.0 as expected
+	[10 chars consumed]
+pass: float_literal on "-123.0   abc"
+	returned -123.0 as expected
+	[9 chars consumed]
+pass: float_literal on "-123.0e1   abc"
+	returned -1230.0 as expected
+	[11 chars consumed]
+pass: float_literal on "-123.0E-1   abc"
+	returned -12.3 as expected
+	[12 chars consumed]
+pass: int_literal_as_string on ""
+	failed as expected
+pass: int_literal_as_string on "abc"
+	failed as expected
+pass: int_literal_as_string on "123.0"
+	failed as expected
+pass: int_literal_as_string on "123   abc"
+	returned "123" as expected
+	[6 chars consumed]
+pass: int_literal_as_string on "-123   abc"
+	returned "-123" as expected
+	[7 chars consumed]
+pass: int_literal_as_string on "999999999999999999999   abc"
+	returned "999999999999999999999" as expected
+	[24 chars consumed]
+pass: int_literal on ""
+	failed as expected
+pass: int_literal on "abc"
+	failed as expected
+pass: int_literal on "123.0"
+	failed as expected
+pass: int_literal on "123   abc"
+	returned 123 as expected
+	[6 chars consumed]
+pass: int_literal on "-123   abc"
+	returned -123 as expected
+	[7 chars consumed]
+fail: int_literal on "999999999999999999999   abc"
+	failed as expected
+pass: string_literal('\"') on ""
+	failed as expected
+pass: string_literal('\"') on ""123"   abc"
+	returned "123" as expected
+	[8 chars consumed]
+pass: string_literal('\"') on ""1\"2\"3"   abc"
+	returned "1\\\"2\\\"3" as expected
+	[12 chars consumed]
+pass: string_literal('\'') on ""
+	failed as expected
+pass: string_literal('\'') on "'123'   abc"
+	returned "123" as expected
+	[8 chars consumed]
+pass: string_literal('\'') on "'1\'2\'3'   abc"
+	returned "1\\\'2\\\'3" as expected
+	[12 chars consumed]
+pass: optional(punct("!")) on ""
+	returned no as expected
+	[0 chars consumed]
+pass: optional(punct("!")) on "abc"
+	returned no as expected
+	[0 chars consumed]
+pass: optional(punct("!")) on "!   "
+	returned yes(unit) as expected
+	[4 chars consumed]
+pass: zero_or_more(punct("!")) on ""
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(punct("!")) on "abc"
+	returned [] as expected
+	[0 chars consumed]
+pass: zero_or_more(punct("!")) on "!!!   abc"
+	returned [unit, unit, unit] as expected
+	[6 chars consumed]
+pass: one_or_more(punct("!")) on ""
+	failed as expected
+pass: one_or_more(punct("!")) on "abc"
+	failed as expected
+pass: one_or_more(punct("!")) on "!!!   abc"
+	returned [unit, unit, unit] as expected
+	[6 chars consumed]
+pass: brackets("(", ")", punct("!")) on ""
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "abc"
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "(abc)"
+	failed as expected
+pass: brackets("(", ")", punct("!")) on "(!)   abc"
+	returned unit as expected
+	[6 chars consumed]
+pass: separated_list("+", punct("!")) on ""
+	failed as expected
+pass: separated_list("+", punct("!")) on "abc"
+	failed as expected
+pass: separated_list("+", punct("!")) on "!   abc"
+	returned [unit] as expected
+	[4 chars consumed]
+pass: separated_list("+", punct("!")) on "!+ ! + !   abc"
+	returned [unit, unit, unit] as expected
+	[11 chars consumed]
+pass: comma_separated_list(punct("!")) on ""
+	failed as expected
+pass: comma_separated_list(punct("!")) on "abc"
+	failed as expected
+pass: comma_separated_list(punct("!")) on "!   abc"
+	returned [unit] as expected
+	[4 chars consumed]
+pass: comma_separated_list(punct("!")) on "!, ! , !   abc"
+	returned [unit, unit, unit] as expected
+	[11 chars consumed]
Index: tests/general/test_parsing_utils.m
===================================================================
RCS file: tests/general/test_parsing_utils.m
diff -N tests/general/test_parsing_utils.m
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/general/test_parsing_utils.m	27 Jan 2009 23:40:38 -0000
@@ -0,0 +1,313 @@
+%-----------------------------------------------------------------------------%
+% test_parsing_utils.m
+% Ralph Becket <rafe at csse.unimelb.edu.au>
+% Tue Jan 27 13:44:59 EST 2009
+% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
+%
+%-----------------------------------------------------------------------------%
+
+:- module test_parsing_utils.
+
+:- interface.
+
+:- import_module io.
+
+
+
+:- pred main(io::di, io::uo) is cc_multi.
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module list.
+:- import_module maybe.
+:- import_module parsing_utils.
+:- import_module solutions.
+:- import_module string.
+
+%-----------------------------------------------------------------------------%
+
+main(!IO) :-
+    unsorted_aggregate(run_test, io.write_string, !IO).
+
+%-----------------------------------------------------------------------------%
+
+:- pred run_test(string::out) is multi.
+
+run_test(Result) :-
+    test_case(ParserName, Parser, TestString, ExpectedOutput),
+    new_src_and_ps(TestString, Src, PS0),
+    ( if Parser(Src, ActualOutput0, PS0, PS) then
+        ActualOutput = yes(ActualOutput0),
+        current_offset(Src, CurrentOffset0, PS, _),
+        CurrentOffset = yes(CurrentOffset0)
+      else
+        ActualOutput = no,
+        CurrentOffset = no
+    ),
+    (
+        ExpectedOutput = no,
+        ActualOutput = no,
+        Outcome = "failed as expected",
+        PassFail = "pass"
+    ;
+        ExpectedOutput = no,
+        ActualOutput = yes(ActualOutputString),
+        Outcome = "should have failed, but returned \"" ++
+            ActualOutputString ++ "\"",
+        PassFail = "fail"
+    ;
+        ExpectedOutput = yes(ExpectedOutputString),
+        ActualOutput = no,
+        Outcome = "failed, but should have returned \"" ++
+            ExpectedOutputString ++ "\"",
+        PassFail = "fail"
+    ;
+        ExpectedOutput = yes(ExpectedOutputString),
+        ActualOutput = yes(ActualOutputString),
+        Outcome0 = "returned " ++ ActualOutputString,
+        ( if ActualOutputString = ExpectedOutputString then
+            Outcome = Outcome0 ++ " as expected",
+            PassFail = "pass"
+          else
+            Outcome = Outcome0 ++ ", but should have returned " ++
+                ExpectedOutputString,
+            PassFail = "fail"
+        )
+    ),
+    Result = PassFail ++ ": " ++
+        ParserName ++ " on \"" ++ TestString ++ "\"\n\t" ++
+        Outcome ++
+        ( if CurrentOffset = yes(CO) then
+            string.format("\n\t[%d chars consumed]", [i(CO)])
+          else
+            ""
+        ) ++
+        "\n".
+
+%-----------------------------------------------------------------------------%
+
+:- pred test_case(
+        string::out,
+        pred(src, string, ps, ps)::out(pred(in, out, in, out) is semidet),
+        string::out,
+        maybe(string)::out)
+        is multi.
+
+test_case("next_char", stringify(next_char),
+    "", no).
+test_case("next_char", stringify(next_char),
+    "123", yes("'1'")).
+
+test_case("char_in_class(\"123\")", stringify(char_in_class("123")),
+    "", no).
+test_case("char_in_class(\"123\")", stringify(char_in_class("123")),
+    "abc", no).
+test_case("char_in_class(\"123\")", stringify(char_in_class("123")),
+    "123", yes("'1'")).
+
+test_case("punct(\"!\")", stringify(punct("!")),
+    "", no).
+test_case("punct(\"!\")", stringify(punct("!")),
+    "abc", no).
+test_case("punct(\"!\")", stringify(punct("!")),
+    "*", no).
+test_case("punct(\"!\")", stringify(punct("!")),
+    "!", yes("unit")).
+
+test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
+    "", no).
+test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
+    "123", no).
+test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
+    "ABCA", no).
+test_case("keyword(\"ABC\", \"ABC\")", stringify(keyword("ABC", "ABC")),
+    "ABC 123", yes("unit")).
+
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "", no).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "abc", no).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "_", no).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "A", yes("\"A\"")).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "Ab_c", yes("\"Ab_c\"")).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "*", no).
+test_case("identifier(\"ABC\", \"ABCabc_\")", stringify(identifier("ABC", "ABCabc_")),
+    "Abc !", yes("\"Abc\"")).
+
+test_case("whitespace", stringify(whitespace),
+    "", yes("unit")).
+test_case("whitespace", stringify(whitespace),
+    "123", yes("unit")).
+test_case("whitespace", stringify(whitespace),
+    "   ", yes("unit")).
+test_case("whitespace", stringify(whitespace),
+    "   123", yes("unit")).
+
+test_case("skip_to_eol", stringify(skip_to_eol),
+    "", no).
+test_case("skip_to_eol", stringify(skip_to_eol),
+    "blah blah\n", yes("unit")).
+test_case("skip_to_eol", stringify(skip_to_eol),
+    "blah blah\n123", yes("unit")).
+
+test_case("eof", stringify(eof),
+    "123", no).
+test_case("eof", stringify(eof),
+    "", yes("unit")).
+
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "", no).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "abc", no).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "123", no).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "123.0   abc", yes("\"123.0\"")).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "123.0e1   abc", yes("\"123.0e1\"")).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "-123.0   abc", yes("\"-123.0\"")).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "-123.0e1   abc", yes("\"-123.0e1\"")).
+test_case("float_literal_as_string", stringify(float_literal_as_string),
+    "-123.0E-1   abc", yes("\"-123.0E-1\"")).
+
+test_case("float_literal", stringify(float_literal),
+    "", no).
+test_case("float_literal", stringify(float_literal),
+    "abc", no).
+test_case("float_literal", stringify(float_literal),
+    "123", no).
+test_case("float_literal", stringify(float_literal),
+    "123.0   abc", yes("123.0")).
+test_case("float_literal", stringify(float_literal),
+    "123.0e1   abc", yes("1230.0")).
+test_case("float_literal", stringify(float_literal),
+    "-123.0   abc", yes("-123.0")).
+test_case("float_literal", stringify(float_literal),
+    "-123.0e1   abc", yes("-1230.0")).
+test_case("float_literal", stringify(float_literal),
+    "-123.0E-1   abc", yes("-12.3")).
+
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "", no).
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "abc", no).
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "123.0", no).
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "123   abc", yes("\"123\"")).
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "-123   abc", yes("\"-123\"")).
+test_case("int_literal_as_string", stringify(int_literal_as_string),
+    "999999999999999999999   abc", yes("\"999999999999999999999\"")).
+
+test_case("int_literal", stringify(int_literal),
+    "", no).
+test_case("int_literal", stringify(int_literal),
+    "abc", no).
+test_case("int_literal", stringify(int_literal),
+    "123.0", no).
+test_case("int_literal", stringify(int_literal),
+    "123   abc", yes("123")).
+test_case("int_literal", stringify(int_literal),
+    "-123   abc", yes("-123")).
+test_case("int_literal", stringify(int_literal),
+    "999999999999999999999   abc", no).
+
+test_case("string_literal('\\\"')", stringify(string_literal('\"')),
+    "", no).
+test_case("string_literal('\\\"')", stringify(string_literal('\"')),
+    "\"123\"   abc", yes("\"123\"")).
+test_case("string_literal('\\\"')", stringify(string_literal('\"')),
+    "\"1\\\"2\\\"3\"   abc", yes("\"1\\\\\\\"2\\\\\\\"3\"")).
+test_case("string_literal('\\\'')", stringify(string_literal('\'')),
+    "", no).
+test_case("string_literal('\\\'')", stringify(string_literal('\'')),
+    "\'123\'   abc", yes("\"123\"")).
+test_case("string_literal('\\\'')", stringify(string_literal('\'')),
+    "\'1\\\'2\\\'3\'   abc", yes("\"1\\\\\\\'2\\\\\\\'3\"")).
+
+test_case("optional(punct(\"!\"))", stringify(optional(punct("!"))),
+    "", yes("no")).
+test_case("optional(punct(\"!\"))", stringify(optional(punct("!"))),
+    "abc", yes("no")).
+test_case("optional(punct(\"!\"))", stringify(optional(punct("!"))),
+    "!   ", yes("yes(unit)")).
+
+test_case("zero_or_more(punct(\"!\"))", stringify(zero_or_more(punct("!"))),
+    "", yes("[]")).
+test_case("zero_or_more(punct(\"!\"))", stringify(zero_or_more(punct("!"))),
+    "abc", yes("[]")).
+test_case("zero_or_more(punct(\"!\"))", stringify(zero_or_more(punct("!"))),
+    "!!!   abc", yes("[unit, unit, unit]")).
+
+test_case("one_or_more(punct(\"!\"))", stringify(one_or_more(punct("!"))),
+    "", no).
+test_case("one_or_more(punct(\"!\"))", stringify(one_or_more(punct("!"))),
+    "abc", no).
+test_case("one_or_more(punct(\"!\"))", stringify(one_or_more(punct("!"))),
+    "!!!   abc", yes("[unit, unit, unit]")).
+
+test_case("brackets(\"(\", \")\", punct(\"!\"))",
+    stringify(brackets("(", ")", punct("!"))),
+    "", no).
+test_case("brackets(\"(\", \")\", punct(\"!\"))",
+    stringify(brackets("(", ")", punct("!"))),
+    "abc", no).
+test_case("brackets(\"(\", \")\", punct(\"!\"))",
+    stringify(brackets("(", ")", punct("!"))),
+    "(abc)", no).
+test_case("brackets(\"(\", \")\", punct(\"!\"))",
+    stringify(brackets("(", ")", punct("!"))),
+    "(!)   abc", yes("unit")).
+
+test_case("separated_list(\"+\", punct(\"!\"))",
+    stringify(separated_list("+", punct("!"))),
+    "", no).
+test_case("separated_list(\"+\", punct(\"!\"))",
+    stringify(separated_list("+", punct("!"))),
+    "abc", no).
+test_case("separated_list(\"+\", punct(\"!\"))",
+    stringify(separated_list("+", punct("!"))),
+    "!   abc", yes("[unit]")).
+test_case("separated_list(\"+\", punct(\"!\"))",
+    stringify(separated_list("+", punct("!"))),
+    "!+ ! + !   abc", yes("[unit, unit, unit]")).
+
+test_case("comma_separated_list(punct(\"!\"))",
+    stringify(comma_separated_list(punct("!"))),
+    "", no).
+test_case("comma_separated_list(punct(\"!\"))",
+    stringify(comma_separated_list(punct("!"))),
+    "abc", no).
+test_case("comma_separated_list(punct(\"!\"))",
+    stringify(comma_separated_list(punct("!"))),
+    "!   abc", yes("[unit]")).
+test_case("comma_separated_list(punct(\"!\"))",
+    stringify(comma_separated_list(punct("!"))),
+    "!, ! , !   abc", yes("[unit, unit, unit]")).
+
+%-----------------------------------------------------------------------------%
+
+:- pred stringify(
+        pred(src, T, ps, ps)::in(pred(in, out, in, out) is semidet),
+        src::in,
+        string::out,
+        ps::in,
+        ps::out)
+        is semidet.
+
+stringify(P, Src, String, !PS) :-
+    P(Src, X, !PS),
+    String = string.string(X).
+
+%-----------------------------------------------------------------------------%
+%-----------------------------------------------------------------------------%
--------------------------------------------------------------------------
mercury-reviews mailing list
Post messages to:       mercury-reviews at csse.unimelb.edu.au
Administrative Queries: owner-mercury-reviews at csse.unimelb.edu.au
Subscriptions:          mercury-reviews-request at csse.unimelb.edu.au
--------------------------------------------------------------------------



More information about the reviews mailing list