[m-rev.] add character ranges to extras/lex

Sebastian Godelet sebastian.godelet+github at gmail.com
Sun Feb 23 09:15:16 AEDT 2014


Hello,

here is an improved version of my patch, it uses a bitset to represent
ranges,
this allows for negated ranges and non-consecutive ranges, which is very
useful for Unicode lexing,
as some scripts are in non-consecutive Unicode blocks.

diff --git a/extras/lex/lex.m b/extras/lex/lex.m
index c6c7930..8876b02 100644
--- a/extras/lex/lex.m
+++ b/extras/lex/lex.m
@@ -30,6 +30,8 @@
 :- import_module maybe.
 :- import_module pair.
 :- import_module string.
+:- import_module sparse_bitset.
+:- import_module enum.

 %-----------------------------------------------------------------------------%

@@ -72,6 +74,11 @@
 :- inst ignore_pred
     ==      ( pred(in) is semidet ).

+    % Represents a set of Unicode characters
+    %
+:- type charset
+    ==      sparse_bitset(char).
+
     % The type of regular expressions.
     %
 :- type regexp.
@@ -100,6 +107,7 @@
 :- instance regexp(regexp).
 :- instance regexp(char).
 :- instance regexp(string).
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)).

     % Some basic non-primitive regexps.
     %
@@ -107,6 +115,7 @@
 :- func anybut(string) = regexp.     % anybut("abc") is complement of
any("abc")
 :- func ?(T) = regexp <= regexp(T).  % ?(R)       = R or null
 :- func +(T) = regexp <= regexp(T).  % +(R)       = R ++ *(R)
+:- func range(char, char) = regexp.  % range('a', 'z') = any("ab...xyz")

     % Some useful single-char regexps.
     %
@@ -117,7 +126,6 @@
 :- func alphanum = regexp.      % alphanum   = alpha or digit
 :- func identstart = regexp.    % identstart = alpha or "_"
 :- func ident = regexp.         % ident      = alphanum or "_"
-:- func nl = regexp.            % nl         = re("\n")
 :- func tab = regexp.           % tab        = re("\t")
 :- func spc = regexp.           % spc        = re(" ")
 :- func wspc = regexp.          % wspc       = any(" \t\n\r\f\v")
@@ -125,6 +133,7 @@

     % Some useful compound regexps.
     %
+:- func nl = regexp.            % nl         = ?("\r") ++ re("\n")
 :- func nat = regexp.           % nat        = +(digit)
 :- func signed_int = regexp.    % signed_int = ?("+" or "-") ++ nat
 :- func real = regexp.          % real       =
\d+((.\d+([eE]int)?)|[eE]int)
@@ -247,6 +256,7 @@
 :- import_module bool.
 :- import_module char.
 :- import_module exception.
+:- import_module require.
 :- import_module int.
 :- import_module map.

@@ -702,6 +712,14 @@ read_from_string(Offset, Result, String,
unsafe_promise_unique(String)) :-
         )
 ].

+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [
+    re(Charset) = R :-
+       R = sparse_bitset.foldl(
+               func(Char::in, R0::in) = (R1::out) is det :-
+                   if R0 = eps then R1 = re(Char) else R1 = (R0 or
re(Char)),
+               Charset, eps)
+].
+
 %-----------------------------------------------------------------------------%
 % Basic primitive regexps.

@@ -714,38 +732,64 @@ read_from_string(Offset, Result, String,
unsafe_promise_unique(String)) :-
 %-----------------------------------------------------------------------------%
 % Some basic non-primitive regexps.

+    % succeeds iff the int value is in [0x0..0x10ffff] and not a surrogate.
+:- func int_is_valid_char(int) = char is semidet.
+
+int_is_valid_char(Value) = Char :-
+    char.from_int(Value, Char),
+    not char.is_surrogate(Char).
+
+:- func make_charset(int, int) = charset.
+
+make_charset(Start, End) = Charset :-
+    ( if Start =< End then
+        Chars = list.filter_map(
+            int_is_valid_char,
+            Start `..` End
+        ),
+        Charset = sparse_bitset.sorted_list_to_set(Chars)
+      else
+        unexpected($file, $pred,
+            format("should: Start < End, but: %d > %d", [i(Start),
i(End)]))
+    ).
+
+    % Latin comprises following Unicode blocks:
+    %  * C0 Controls and Basic Latin
+    %  * C1 Controls and Latin1 Suplement
+    %  * Latin Extended-A
+    %  * Latin Extended-B
+:- func latin_chars = charset is det.
+
+latin_chars = make_charset(0x01, 0x02af).
+
 any(S) = R :-
     ( if S = "" then
         R = null
       else
-        L = string.length(S),
-        C = string.det_index(S, L - 1),
-        R = str_foldr(func(Cx, Rx) = (Cx or Rx), S, re(C), L - 2)
+        R = re(sparse_bitset.list_to_set(string.to_char_list(S)))
     ).

-anybut(S0) = R :-
-    S = string.from_char_list(
-            list.filter_map(
-                ( func(X) = C is semidet :-
-                    char.to_int(C, X),
-                    not string.contains_char(S0, C)
-                ),
-                0x01 `..` 0xff
-            )
-        ),
-    R = any(S).
+anybut(S) = R :-
+    ( if S = "" then
+        R = re(latin_chars)
+      else
+        ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)),
+        R = re(sparse_bitset.difference(latin_chars, ExcludedChars))
+    ).

 :- func str_foldr(func(char, T) = T, string, T, int) = T.

 str_foldr(Fn, S, X, I) =
     ( if I < 0 then X
                else str_foldr(Fn, S, Fn(string.det_index(S, I), X), I - 1)
-    ).
+    ).

 ?(R) = (R or null).

 +(R) = (R ++ *(R)).

+range(Start, End) = re(make_charset(char.to_int(Start), char.to_int(End))).
+
 %-----------------------------------------------------------------------------%
 % Some useful single-char regexps.

@@ -763,18 +807,18 @@ digit      = any("0123456789").
 lower      = any("abcdefghijklmnopqrstuvwxyz").
 upper      = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
 wspc       = any(" \t\n\r\f\v").
-dot        = anybut("\n").
+dot        = anybut("\r\n").
 alpha      = (lower or upper).
 alphanum   = (alpha or digit).
 identstart = (alpha or ('_')).
 ident      = (alphanum or ('_')).
-nl         = re('\n').
 tab        = re('\t').
 spc        = re(' ').

 %-----------------------------------------------------------------------------%
 % Some useful compound regexps.

+nl         = (?('\r') ++ '\n').  % matches both Posix and Windows newline.
 nat        = +(digit).
 signed_int = ?("+" or "-") ++ nat.
 real       = signed_int ++ (
diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m
index 6d30ac2..256408c 100644
--- a/extras/lex/samples/lex_demo.m
+++ b/extras/lex/samples/lex_demo.m
@@ -86,7 +86,7 @@ tokenise_stdin(!LS) :-
     ;       prep(string)
     ;       punc
     ;       space
-    ;       unrecognised(string).
+    ;       word(string).

 :- func lexemes = list(lexeme(token)).

@@ -125,7 +125,9 @@ lexemes = [
     ( any("~!@#$%^&*()_+`-={}|[]\\:"";'<>?,./")
                         -> return(punc) ),
     ( whitespace        -> return(space) ),
-    ( dot               -> func(Match) = unrecognised(Match) )
+    ( +(range('a', 'z') or
+        range('A', 'Z')
+       )                -> func(Match) = word(Match) )
 ].

 %-----------------------------------------------------------------------------%


On 21 February 2014 02:38, Paul Bone <paul at bone.id.au> wrote:

> On Sat, Feb 15, 2014 at 08:30:32PM +0100, Sebastian Godelet wrote:
> > For review by anyone.
> >
> > To facilitate easier lexeme definition,
> > add a new range/2 function which works as a simple character class like
> in
> > Perl regular expressions.
> > For example: range('a', 'f') = any("abcdef").
> >
> > extras/lex/lex.m:
> >    adds func range(char, char) = regexp.
> >
> > extras/lex/samples/lex_demo.m
> >    adds a word recognizer just before the "junk" lexeme.
>
> I can't find the changes to lex_demo.m in your attached patch.
>
> >
> > I hope you find this useful.
> > If my changes get approved in some form (this is my first contribution)
> > I'd happily enhance the basic lexer to become more expressive and
> powerful.
> > I was thinking of range/3: range(From, To, Exclude = type set(char).
>
> Thanks Sebastian,
>
> These changes are good in principal, although I cannot yet review
> lex_demo.m.
>
> If you want more flexibility it looks like the existing code could be made
> more flexible as well.  One idea is to create a new version of the any/1
> function which takes a list.
>
>     :- func any_list(list(char)) = regexp.
>
> Then your range example with the exclude list can be written easily:
>
>     any_list(not_in(Exclude), char_range(From .. To))
>
> Of course now you need a predicate not_in/1 and a function char_range.  But
> those should be simple.
>
> Many of the functions and predicates in the list module can then be used to
> describe sets of characters.
>
>
> > +range(S, E) = R :-
> > +    char.to_int(S, Si),
> > +    char.to_int(E, Ei),
> > +    ( if Si < Ei then
> > +        R = build_range(Si + 1, Ei, re(S))
> > +      else if Si = Ei then
> > +        R = re(S)
> > +      else
> > +        R = null
> > +    ).
> > +
> > +:- func build_range(int, int, regexp) = regexp.
> > +
> > +build_range(S, E, R0) = R :-
> > +    ( if S < E then
> > +        char.det_from_int(S, C),
> > +        R1 = (R0 or re(C)),
> > +        R = build_range(S + 1, E, R1)
> > +      else if S = E then
> > +        R = R0
> > +      else
> > +        throw(exception.software_error("invalid range!"))
> > +    ).
> > +
>
> Try to use more meaningful variable names, rather than S, E and C call
> these
> Start End and Char.  I was able to work this out by looking at your code
> however you can avoid many misunderstandings with well written code.
>
> We also have some useful exception throwing functions in the module
> require.
> error($file, $pred, "invalid range") will throw a software error exception
> that describes the location of the error.
>
> >
>  %-----------------------------------------------------------------------------%
> >  % Some useful single-char regexps.
> >
> > @@ -768,13 +793,13 @@ alpha      = (lower or upper).
> >  alphanum   = (alpha or digit).
> >  identstart = (alpha or ('_')).
> >  ident      = (alphanum or ('_')).
> > -nl         = re('\n').
> >  tab        = re('\t').
> >  spc        = re(' ').
> >
> >
>  %-----------------------------------------------------------------------------%
> >  % Some useful compound regexps.
> >
> > +nl         = (?('\r') ++ '\n').  % matches both Posix and Windows
> newline.
> >  nat        = +(digit).
> >  signed_int = ?("+" or "-") ++ nat.
> >  real       = signed_int ++ (
>
> Good idea.
>
> > diff --git a/extras/lex/samples/lex_demo.m
> b/extras/lex/samples/lex_demo.m
> > index 6d30ac2..68aef0d 100644
> > --- a/extras/lex/samples/lex_demo.m
> > +++ b/extras/lex/samples/lex_demo.m
>
> The changes to this file seem to be missing.
>
>
> --
> Paul Bone
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20140222/27da0798/attachment.html>


More information about the reviews mailing list