<div dir="ltr"><div><div>Hello,</div><div> </div><div>here is an improved version of my patch, it uses a bitset to represent ranges,</div><div>this allows for negated ranges and non-consecutive ranges, which is very useful for Unicode lexing,</div> <div>as some scripts are in non-consecutive Unicode blocks.</div></div><div> </div><div>diff --git a/extras/lex/lex.m b/extras/lex/lex.m index c6c7930..8876b02 100644 --- a/extras/lex/lex.m +++ b/extras/lex/lex.m @@ -30,6 +30,8 @@ :- import_module maybe. :- import_module pair. :- import_module string. +:- import_module sparse_bitset. +:- import_module enum. %-----------------------------------------------------------------------------% @@ -72,6 +74,11 @@ :- inst ignore_pred == ( pred(in) is semidet ). + % Represents a set of Unicode characters + % +:- type charset + == sparse_bitset(char). + % The type of regular expressions. % :- type regexp. @@ -100,6 +107,7 @@ :- instance regexp(regexp). :- instance regexp(char). :- instance regexp(string). +:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)). % Some basic non-primitive regexps. % @@ -107,6 +115,7 @@ :- func anybut(string) = regexp. % anybut("abc") is complement of any("abc") :- func ?(T) = regexp <= regexp(T). % ?(R) = R or null :- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R) +:- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz") % Some useful single-char regexps. % @@ -117,7 +126,6 @@ :- func alphanum = regexp. % alphanum = alpha or digit :- func identstart = regexp. % identstart = alpha or "_" :- func ident = regexp. % ident = alphanum or "_" -:- func nl = regexp. % nl = re("\n") :- func tab = regexp. % tab = re("\t") :- func spc = regexp. % spc = re(" ") :- func wspc = regexp. % wspc = any(" \t\n\r\f\v") @@ -125,6 +133,7 @@ % Some useful compound regexps. % +:- func nl = regexp. % nl = ?("\r") ++ re("\n") :- func nat = regexp. % nat = +(digit) :- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat :- func real = regexp. % real = \d+((.\d+([eE]int)?)|[eE]int) @@ -247,6 +256,7 @@ :- import_module bool. :- import_module char. :- import_module exception. +:- import_module require. :- import_module int. :- import_module map. @@ -702,6 +712,14 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :- ) ]. +:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [ + re(Charset) = R :- + R = sparse_bitset.foldl( + func(Char::in, R0::in) = (R1::out) is det :- + if R0 = eps then R1 = re(Char) else R1 = (R0 or re(Char)), + Charset, eps) +]. + %-----------------------------------------------------------------------------% % Basic primitive regexps. @@ -714,38 +732,64 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :- %-----------------------------------------------------------------------------% % Some basic non-primitive regexps. + % succeeds iff the int value is in [0x0..0x10ffff] and not a surrogate. +:- func int_is_valid_char(int) = char is semidet. + +int_is_valid_char(Value) = Char :- + char.from_int(Value, Char), + not char.is_surrogate(Char). + +:- func make_charset(int, int) = charset. + +make_charset(Start, End) = Charset :- + ( if Start =< End then + Chars = list.filter_map( + int_is_valid_char, + Start `..` End + ), + Charset = sparse_bitset.sorted_list_to_set(Chars) + else + unexpected($file, $pred, + format("should: Start < End, but: %d > %d", [i(Start), i(End)])) + ). + + % Latin comprises following Unicode blocks: + % * C0 Controls and Basic Latin + % * C1 Controls and Latin1 Suplement + % * Latin Extended-A + % * Latin Extended-B +:- func latin_chars = charset is det. + +latin_chars = make_charset(0x01, 0x02af). + any(S) = R :- ( if S = "" then R = null else - L = string.length(S), - C = string.det_index(S, L - 1), - R = str_foldr(func(Cx, Rx) = (Cx or Rx), S, re(C), L - 2) + R = re(sparse_bitset.list_to_set(string.to_char_list(S))) ). -anybut(S0) = R :- - S = string.from_char_list( - list.filter_map( - ( func(X) = C is semidet :- - char.to_int(C, X), - not string.contains_char(S0, C) - ), - 0x01 `..` 0xff - ) - ), - R = any(S). +anybut(S) = R :- + ( if S = "" then + R = re(latin_chars) + else + ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)), + R = re(sparse_bitset.difference(latin_chars, ExcludedChars)) + ). :- func str_foldr(func(char, T) = T, string, T, int) = T. str_foldr(Fn, S, X, I) = ( if I < 0 then X else str_foldr(Fn, S, Fn(string.det_index(S, I), X), I - 1) - ). + ). ?(R) = (R or null). +(R) = (R ++ *(R)). +range(Start, End) = re(make_charset(char.to_int(Start), char.to_int(End))). + %-----------------------------------------------------------------------------% % Some useful single-char regexps. @@ -763,18 +807,18 @@ digit = any("0123456789"). lower = any("abcdefghijklmnopqrstuvwxyz"). upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ"). wspc = any(" \t\n\r\f\v"). -dot = anybut("\n"). +dot = anybut("\r\n"). alpha = (lower or upper). alphanum = (alpha or digit). identstart = (alpha or ('_')). ident = (alphanum or ('_')). -nl = re('\n'). tab = re('\t'). spc = re(' '). %-----------------------------------------------------------------------------% % Some useful compound regexps. +nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline. nat = +(digit). signed_int = ?("+" or "-") ++ nat. real = signed_int ++ ( diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m index 6d30ac2..256408c 100644 --- a/extras/lex/samples/lex_demo.m +++ b/extras/lex/samples/lex_demo.m @@ -86,7 +86,7 @@ tokenise_stdin(!LS) :- ; prep(string) ; punc ; space - ; unrecognised(string). + ; word(string). :- func lexemes = list(lexeme(token)). @@ -125,7 +125,9 @@ lexemes = [ ( any("~!@#$%^&*()_+`-={}|[]\\:"";'<>?,./") -> return(punc) ), ( whitespace -> return(space) ), - ( dot -> func(Match) = unrecognised(Match) ) + ( +(range('a', 'z') or + range('A', 'Z') + ) -> func(Match) = word(Match) ) ]. %-----------------------------------------------------------------------------% </div><div class="gmail_extra"> <div class="gmail_quote">On 21 February 2014 02:38, Paul Bone <<a href="mailto:paul@bone.id.au" target="_blank">paul@bone.id.au</a>> wrote: <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;padding-left:1ex;border-left-color:rgb(204,204,204);border-left-width:1px;border-left-style:solid"><div>On Sat, Feb 15, 2014 at 08:30:32PM +0100, Sebastian Godelet wrote: > For review by anyone. > > To facilitate easier lexeme definition, > add a new range/2 function which works as a simple character class like in > Perl regular expressions. > For example: range('a', 'f') = any("abcdef"). > > extras/lex/lex.m: > adds func range(char, char) = regexp. > > extras/lex/samples/lex_demo.m > adds a word recognizer just before the "junk" lexeme. </div>I can't find the changes to lex_demo.m in your attached patch. <div> > > I hope you find this useful. > If my changes get approved in some form (this is my first contribution) > I'd happily enhance the basic lexer to become more expressive and powerful. > I was thinking of range/3: range(From, To, Exclude = type set(char). </div>Thanks Sebastian, These changes are good in principal, although I cannot yet review lex_demo.m. If you want more flexibility it looks like the existing code could be made more flexible as well. One idea is to create a new version of the any/1 function which takes a list. :- func any_list(list(char)) = regexp. Then your range example with the exclude list can be written easily: any_list(not_in(Exclude), char_range(From .. To)) Of course now you need a predicate not_in/1 and a function char_range. But those should be simple. Many of the functions and predicates in the list module can then be used to describe sets of characters. <div> > +range(S, E) = R :- > + char.to_int(S, Si), > + char.to_int(E, Ei), > + ( if Si < Ei then > + R = build_range(Si + 1, Ei, re(S)) > + else if Si = Ei then > + R = re(S) > + else > + R = null > + ). > + > +:- func build_range(int, int, regexp) = regexp. > + > +build_range(S, E, R0) = R :- > + ( if S < E then > + char.det_from_int(S, C), > + R1 = (R0 or re(C)), > + R = build_range(S + 1, E, R1) > + else if S = E then > + R = R0 > + else > + throw(exception.software_error("invalid range!")) > + ). > + </div>Try to use more meaningful variable names, rather than S, E and C call these Start End and Char. I was able to work this out by looking at your code however you can avoid many misunderstandings with well written code. We also have some useful exception throwing functions in the module require. error($file, $pred, "invalid range") will throw a software error exception that describes the location of the error. <div> > %-----------------------------------------------------------------------------% > % Some useful single-char regexps. > > @@ -768,13 +793,13 @@ alpha = (lower or upper). > alphanum = (alpha or digit). > identstart = (alpha or ('_')). > ident = (alphanum or ('_')). > -nl = re('\n'). > tab = re('\t'). > spc = re(' '). > > %-----------------------------------------------------------------------------% > % Some useful compound regexps. > > +nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline. > nat = +(digit). > signed_int = ?("+" or "-") ++ nat. > real = signed_int ++ ( </div>Good idea. <div> > diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m > index 6d30ac2..68aef0d 100644 > --- a/extras/lex/samples/lex_demo.m > +++ b/extras/lex/samples/lex_demo.m </div>The changes to this file seem to be missing. -- Paul Bone </blockquote></div> </div></div>