[m-rev.] add character ranges to extras/lex
Sebastian Godelet
sebastian.godelet+github at gmail.com
Sun Feb 23 09:15:16 AEDT 2014
Hello,
here is an improved version of my patch, it uses a bitset to represent
ranges,
this allows for negated ranges and non-consecutive ranges, which is very
useful for Unicode lexing,
as some scripts are in non-consecutive Unicode blocks.
diff --git a/extras/lex/lex.m b/extras/lex/lex.m
index c6c7930..8876b02 100644
--- a/extras/lex/lex.m
+++ b/extras/lex/lex.m
@@ -30,6 +30,8 @@
:- import_module maybe.
:- import_module pair.
:- import_module string.
+:- import_module sparse_bitset.
+:- import_module enum.
%-----------------------------------------------------------------------------%
@@ -72,6 +74,11 @@
:- inst ignore_pred
== ( pred(in) is semidet ).
+ % Represents a set of Unicode characters
+ %
+:- type charset
+ == sparse_bitset(char).
+
% The type of regular expressions.
%
:- type regexp.
@@ -100,6 +107,7 @@
:- instance regexp(regexp).
:- instance regexp(char).
:- instance regexp(string).
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)).
% Some basic non-primitive regexps.
%
@@ -107,6 +115,7 @@
:- func anybut(string) = regexp. % anybut("abc") is complement of
any("abc")
:- func ?(T) = regexp <= regexp(T). % ?(R) = R or null
:- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R)
+:- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz")
% Some useful single-char regexps.
%
@@ -117,7 +126,6 @@
:- func alphanum = regexp. % alphanum = alpha or digit
:- func identstart = regexp. % identstart = alpha or "_"
:- func ident = regexp. % ident = alphanum or "_"
-:- func nl = regexp. % nl = re("\n")
:- func tab = regexp. % tab = re("\t")
:- func spc = regexp. % spc = re(" ")
:- func wspc = regexp. % wspc = any(" \t\n\r\f\v")
@@ -125,6 +133,7 @@
% Some useful compound regexps.
%
+:- func nl = regexp. % nl = ?("\r") ++ re("\n")
:- func nat = regexp. % nat = +(digit)
:- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat
:- func real = regexp. % real =
\d+((.\d+([eE]int)?)|[eE]int)
@@ -247,6 +256,7 @@
:- import_module bool.
:- import_module char.
:- import_module exception.
+:- import_module require.
:- import_module int.
:- import_module map.
@@ -702,6 +712,14 @@ read_from_string(Offset, Result, String,
unsafe_promise_unique(String)) :-
)
].
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [
+ re(Charset) = R :-
+ R = sparse_bitset.foldl(
+ func(Char::in, R0::in) = (R1::out) is det :-
+ if R0 = eps then R1 = re(Char) else R1 = (R0 or
re(Char)),
+ Charset, eps)
+].
+
%-----------------------------------------------------------------------------%
% Basic primitive regexps.
@@ -714,38 +732,64 @@ read_from_string(Offset, Result, String,
unsafe_promise_unique(String)) :-
%-----------------------------------------------------------------------------%
% Some basic non-primitive regexps.
+ % succeeds iff the int value is in [0x0..0x10ffff] and not a surrogate.
+:- func int_is_valid_char(int) = char is semidet.
+
+int_is_valid_char(Value) = Char :-
+ char.from_int(Value, Char),
+ not char.is_surrogate(Char).
+
+:- func make_charset(int, int) = charset.
+
+make_charset(Start, End) = Charset :-
+ ( if Start =< End then
+ Chars = list.filter_map(
+ int_is_valid_char,
+ Start `..` End
+ ),
+ Charset = sparse_bitset.sorted_list_to_set(Chars)
+ else
+ unexpected($file, $pred,
+ format("should: Start < End, but: %d > %d", [i(Start),
i(End)]))
+ ).
+
+ % Latin comprises following Unicode blocks:
+ % * C0 Controls and Basic Latin
+ % * C1 Controls and Latin1 Suplement
+ % * Latin Extended-A
+ % * Latin Extended-B
+:- func latin_chars = charset is det.
+
+latin_chars = make_charset(0x01, 0x02af).
+
any(S) = R :-
( if S = "" then
R = null
else
- L = string.length(S),
- C = string.det_index(S, L - 1),
- R = str_foldr(func(Cx, Rx) = (Cx or Rx), S, re(C), L - 2)
+ R = re(sparse_bitset.list_to_set(string.to_char_list(S)))
).
-anybut(S0) = R :-
- S = string.from_char_list(
- list.filter_map(
- ( func(X) = C is semidet :-
- char.to_int(C, X),
- not string.contains_char(S0, C)
- ),
- 0x01 `..` 0xff
- )
- ),
- R = any(S).
+anybut(S) = R :-
+ ( if S = "" then
+ R = re(latin_chars)
+ else
+ ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)),
+ R = re(sparse_bitset.difference(latin_chars, ExcludedChars))
+ ).
:- func str_foldr(func(char, T) = T, string, T, int) = T.
str_foldr(Fn, S, X, I) =
( if I < 0 then X
else str_foldr(Fn, S, Fn(string.det_index(S, I), X), I - 1)
- ).
+ ).
?(R) = (R or null).
+(R) = (R ++ *(R)).
+range(Start, End) = re(make_charset(char.to_int(Start), char.to_int(End))).
+
%-----------------------------------------------------------------------------%
% Some useful single-char regexps.
@@ -763,18 +807,18 @@ digit = any("0123456789").
lower = any("abcdefghijklmnopqrstuvwxyz").
upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
wspc = any(" \t\n\r\f\v").
-dot = anybut("\n").
+dot = anybut("\r\n").
alpha = (lower or upper).
alphanum = (alpha or digit).
identstart = (alpha or ('_')).
ident = (alphanum or ('_')).
-nl = re('\n').
tab = re('\t').
spc = re(' ').
%-----------------------------------------------------------------------------%
% Some useful compound regexps.
+nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline.
nat = +(digit).
signed_int = ?("+" or "-") ++ nat.
real = signed_int ++ (
diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m
index 6d30ac2..256408c 100644
--- a/extras/lex/samples/lex_demo.m
+++ b/extras/lex/samples/lex_demo.m
@@ -86,7 +86,7 @@ tokenise_stdin(!LS) :-
; prep(string)
; punc
; space
- ; unrecognised(string).
+ ; word(string).
:- func lexemes = list(lexeme(token)).
@@ -125,7 +125,9 @@ lexemes = [
( any("~!@#$%^&*()_+`-={}|[]\\:"";'<>?,./")
-> return(punc) ),
( whitespace -> return(space) ),
- ( dot -> func(Match) = unrecognised(Match) )
+ ( +(range('a', 'z') or
+ range('A', 'Z')
+ ) -> func(Match) = word(Match) )
].
%-----------------------------------------------------------------------------%
On 21 February 2014 02:38, Paul Bone <paul at bone.id.au> wrote:
> On Sat, Feb 15, 2014 at 08:30:32PM +0100, Sebastian Godelet wrote:
> > For review by anyone.
> >
> > To facilitate easier lexeme definition,
> > add a new range/2 function which works as a simple character class like
> in
> > Perl regular expressions.
> > For example: range('a', 'f') = any("abcdef").
> >
> > extras/lex/lex.m:
> > adds func range(char, char) = regexp.
> >
> > extras/lex/samples/lex_demo.m
> > adds a word recognizer just before the "junk" lexeme.
>
> I can't find the changes to lex_demo.m in your attached patch.
>
> >
> > I hope you find this useful.
> > If my changes get approved in some form (this is my first contribution)
> > I'd happily enhance the basic lexer to become more expressive and
> powerful.
> > I was thinking of range/3: range(From, To, Exclude = type set(char).
>
> Thanks Sebastian,
>
> These changes are good in principal, although I cannot yet review
> lex_demo.m.
>
> If you want more flexibility it looks like the existing code could be made
> more flexible as well. One idea is to create a new version of the any/1
> function which takes a list.
>
> :- func any_list(list(char)) = regexp.
>
> Then your range example with the exclude list can be written easily:
>
> any_list(not_in(Exclude), char_range(From .. To))
>
> Of course now you need a predicate not_in/1 and a function char_range. But
> those should be simple.
>
> Many of the functions and predicates in the list module can then be used to
> describe sets of characters.
>
>
> > +range(S, E) = R :-
> > + char.to_int(S, Si),
> > + char.to_int(E, Ei),
> > + ( if Si < Ei then
> > + R = build_range(Si + 1, Ei, re(S))
> > + else if Si = Ei then
> > + R = re(S)
> > + else
> > + R = null
> > + ).
> > +
> > +:- func build_range(int, int, regexp) = regexp.
> > +
> > +build_range(S, E, R0) = R :-
> > + ( if S < E then
> > + char.det_from_int(S, C),
> > + R1 = (R0 or re(C)),
> > + R = build_range(S + 1, E, R1)
> > + else if S = E then
> > + R = R0
> > + else
> > + throw(exception.software_error("invalid range!"))
> > + ).
> > +
>
> Try to use more meaningful variable names, rather than S, E and C call
> these
> Start End and Char. I was able to work this out by looking at your code
> however you can avoid many misunderstandings with well written code.
>
> We also have some useful exception throwing functions in the module
> require.
> error($file, $pred, "invalid range") will throw a software error exception
> that describes the location of the error.
>
> >
> %-----------------------------------------------------------------------------%
> > % Some useful single-char regexps.
> >
> > @@ -768,13 +793,13 @@ alpha = (lower or upper).
> > alphanum = (alpha or digit).
> > identstart = (alpha or ('_')).
> > ident = (alphanum or ('_')).
> > -nl = re('\n').
> > tab = re('\t').
> > spc = re(' ').
> >
> >
> %-----------------------------------------------------------------------------%
> > % Some useful compound regexps.
> >
> > +nl = (?('\r') ++ '\n'). % matches both Posix and Windows
> newline.
> > nat = +(digit).
> > signed_int = ?("+" or "-") ++ nat.
> > real = signed_int ++ (
>
> Good idea.
>
> > diff --git a/extras/lex/samples/lex_demo.m
> b/extras/lex/samples/lex_demo.m
> > index 6d30ac2..68aef0d 100644
> > --- a/extras/lex/samples/lex_demo.m
> > +++ b/extras/lex/samples/lex_demo.m
>
> The changes to this file seem to be missing.
>
>
> --
> Paul Bone
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20140222/27da0798/attachment.html>
More information about the reviews
mailing list