<div dir="ltr"><div><div>Hello,</div><div><br></div><div>Here is an improved version of my patch. It uses a bitset to represent ranges;</div><div>this allows for negated ranges and non-consecutive ranges, which is very useful for Unicode lexing,</div>
<div>as some scripts are in non-consecutive Unicode blocks.</div></div><div><br></div><div>diff --git a/extras/lex/lex.m b/extras/lex/lex.m<br>index c6c7930..8876b02 100644<br>--- a/extras/lex/lex.m<br>+++ b/extras/lex/lex.m<br>
@@ -30,6 +30,8 @@<br> :- import_module maybe.<br> :- import_module pair.<br> :- import_module string.<br>+:- import_module sparse_bitset.<br>+:- import_module enum.<br> <br> %-----------------------------------------------------------------------------%<br>
<br>@@ -72,6 +74,11 @@<br> :- inst ignore_pred<br> == ( pred(in) is semidet ).<br> <br>+ % Represents a set of Unicode characters<br>+ % <br>+:- type charset<br>+ == sparse_bitset(char).<br>+<br> % The type of regular expressions.<br>
%<br> :- type regexp.<br>@@ -100,6 +107,7 @@<br> :- instance regexp(regexp).<br> :- instance regexp(char).<br> :- instance regexp(string).<br>+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)).<br> <br>
% Some basic non-primitive regexps.<br>
%<br>@@ -107,6 +115,7 @@<br> :- func anybut(string) = regexp. % anybut("abc") is complement of any("abc")<br> :- func ?(T) = regexp <= regexp(T). % ?(R) = R or null<br> :- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R)<br>
+:- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz")<br> <br> % Some useful single-char regexps.<br> %<br>@@ -117,7 +126,6 @@<br> :- func alphanum = regexp. % alphanum = alpha or digit<br>
:- func identstart = regexp. % identstart = alpha or "_"<br> :- func ident = regexp. % ident = alphanum or "_"<br>-:- func nl = regexp. % nl = re("\n")<br>
:- func tab = regexp. % tab = re("\t")<br> :- func spc = regexp. % spc = re(" ")<br> :- func wspc = regexp. % wspc = any(" \t\n\r\f\v")<br>@@ -125,6 +133,7 @@<br>
<br> % Some useful compound regexps.<br> %<br>+:- func nl = regexp. % nl = ?("\r") ++ re("\n")<br> :- func nat = regexp. % nat = +(digit)<br> :- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat<br>
:- func real = regexp. % real = \d+((.\d+([eE]int)?)|[eE]int)<br>@@ -247,6 +256,7 @@<br> :- import_module bool.<br> :- import_module char.<br> :- import_module exception.<br>+:- import_module require.<br>
:- import_module int.<br>
:- import_module map.<br> <br>@@ -702,6 +712,14 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-<br> )<br> ].<br> <br>+:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [<br>
+ re(Charset) = R :-<br>+ R = sparse_bitset.foldl(<br>+ func(Char::in, R0::in) = (R1::out) is det :- <br>+ if R0 = eps then R1 = re(Char) else R1 = (R0 or re(Char)), <br>+ Charset, eps)<br>
+].<br>+<br> %-----------------------------------------------------------------------------%<br> % Basic primitive regexps.<br> <br>@@ -714,38 +732,64 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-<br>
%-----------------------------------------------------------------------------%<br> % Some basic non-primitive regexps.<br> <br>+ % succeeds iff the int value is in [0x0..0x10ffff] and not a surrogate.<br>+:- func int_is_valid_char(int) = char is semidet.<br>
+<br>+int_is_valid_char(Value) = Char :- <br>+ char.from_int(Value, Char), <br>+ not char.is_surrogate(Char).<br>+<br>+:- func make_charset(int, int) = charset.<br>+<br>+make_charset(Start, End) = Charset :-<br>+ ( if Start =< End then<br>
+ Chars = list.filter_map(<br>+ int_is_valid_char,<br>+ Start `..` End<br>+ ),<br>+ Charset = sparse_bitset.sorted_list_to_set(Chars)<br>+ else <br>+ unexpected($file, $pred, <br>
+ format("should: Start < End, but: %d > %d", [i(Start), i(End)]))<br>+ ).<br>+ <br>+ % Latin comprises the following Unicode blocks:<br>+ % * C0 Controls and Basic Latin<br>+ % * C1 Controls and Latin1 Supplement<br>
+ % * Latin Extended-A<br>+ % * Latin Extended-B<br>+:- func latin_chars = charset is det.<br>+<br>+latin_chars = make_charset(0x01, 0x02af). <br>+<br> any(S) = R :-<br> ( if S = "" then<br> R = null<br>
else<br>- L = string.length(S),<br>- C = string.det_index(S, L - 1),<br>- R = str_foldr(func(Cx, Rx) = (Cx or Rx), S, re(C), L - 2)<br>+ R = re(sparse_bitset.list_to_set(string.to_char_list(S)))<br>
).<br> <br>-anybut(S0) = R :-<br>- S = string.from_char_list(<br>- list.filter_map(<br>- ( func(X) = C is semidet :-<br>- char.to_int(C, X),<br>- not string.contains_char(S0, C)<br>
- ),<br>- 0x01 `..` 0xff<br>- )<br>- ),<br>- R = any(S).<br>+anybut(S) = R :-<br>+ ( if S = "" then<br>+ R = re(latin_chars)<br>+ else<br>+ ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)),<br>
+ R = re(sparse_bitset.difference(latin_chars, ExcludedChars))<br>+ ).<br> <br> :- func str_foldr(func(char, T) = T, string, T, int) = T.<br> <br> str_foldr(Fn, S, X, I) =<br> ( if I < 0 then X<br> else str_foldr(Fn, S, Fn(string.det_index(S, I), X), I - 1)<br>
- ).<br>+ ). <br> <br> ?(R) = (R or null).<br> <br> +(R) = (R ++ *(R)).<br> <br>+range(Start, End) = re(make_charset(char.to_int(Start), char.to_int(End))).<br>+<br> %-----------------------------------------------------------------------------%<br>
% Some useful single-char regexps.<br> <br>@@ -763,18 +807,18 @@ digit = any("0123456789").<br> lower = any("abcdefghijklmnopqrstuvwxyz").<br> upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").<br>
wspc = any(" \t\n\r\f\v").<br>-dot = anybut("\n").<br>+dot = anybut("\r\n").<br> alpha = (lower or upper).<br> alphanum = (alpha or digit).<br> identstart = (alpha or ('_')).<br>
ident = (alphanum or ('_')).<br>-nl = re('\n').<br> tab = re('\t').<br> spc = re(' ').<br> <br> %-----------------------------------------------------------------------------%<br>
% Some useful compound regexps.<br> <br>+nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline.<br> nat = +(digit).<br> signed_int = ?("+" or "-") ++ nat.<br>
real = signed_int ++ (<br>diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m<br>index 6d30ac2..256408c 100644<br>--- a/extras/lex/samples/lex_demo.m<br>+++ b/extras/lex/samples/lex_demo.m<br>
@@ -86,7 +86,7 @@ tokenise_stdin(!LS) :-<br> ; prep(string)<br> ; punc<br> ; space<br>- ; unrecognised(string).<br>+ ; word(string).<br> <br> :- func lexemes = list(lexeme(token)).<br>
<br>@@ -125,7 +125,9 @@ lexemes = [<br> ( any("~!@#$%^&*()_+`-={}|[]\\:"";'<>?,./")<br> -> return(punc) ),<br> ( whitespace -> return(space) ),<br>
- ( dot -> func(Match) = unrecognised(Match) )<br>+ ( +(range('a', 'z') or<br>+ range('A', 'Z')<br>+ ) -> func(Match) = word(Match) )<br>
].<br> <br> %-----------------------------------------------------------------------------%<br></div><div class="gmail_extra"><br><br><div class="gmail_quote">On 21 February 2014 02:38, Paul Bone <span dir="ltr"><<a href="mailto:paul@bone.id.au" target="_blank">paul@bone.id.au</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;padding-left:1ex;border-left-color:rgb(204,204,204);border-left-width:1px;border-left-style:solid"><div>On Sat, Feb 15, 2014 at 08:30:32PM +0100, Sebastian Godelet wrote:<br>
> For review by anyone.<br>
><br>
> To facilitate easier lexeme definition,<br>
> add a new range/2 function which works as a simple character class like in<br>
> Perl regular expressions.<br>
> For example: range('a', 'f') = any("abcdef").<br>
><br>
> extras/lex/lex.m:<br>
> adds func range(char, char) = regexp.<br>
><br>
> extras/lex/samples/lex_demo.m<br>
> adds a word recognizer just before the "junk" lexeme.<br>
<br>
</div>I can't find the changes to lex_demo.m in your attached patch.<br>
<div><br>
><br>
> I hope you find this useful.<br>
> If my changes get approved in some form (this is my first contribution)<br>
> I'd happily enhance the basic lexer to become more expressive and powerful.<br>
> I was thinking of range/3: range(From, To, Exclude), where Exclude has type set(char).<br>
<br>
</div>Thanks Sebastian,<br>
<br>
These changes are good in principle, although I cannot yet review<br>
lex_demo.m.<br>
<br>
If you want more flexibility it looks like the existing code could be made<br>
more flexible as well. One idea is to create a new version of the any/1<br>
function which takes a list.<br>
<br>
:- func any_list(list(char)) = regexp.<br>
<br>
Then your range example with the exclude list can be written easily:<br>
<br>
any_list(not_in(Exclude), char_range(From .. To))<br>
<br>
Of course now you need a predicate not_in/1 and a function char_range. But<br>
those should be simple.<br>
<br>
Many of the functions and predicates in the list module can then be used to<br>
describe sets of characters.<br>
<div><br>
<br>
> +range(S, E) = R :-<br>
> + char.to_int(S, Si),<br>
> + char.to_int(E, Ei),<br>
> + ( if Si < Ei then<br>
> + R = build_range(Si + 1, Ei, re(S))<br>
> + else if Si = Ei then<br>
> + R = re(S)<br>
> + else<br>
> + R = null<br>
> + ).<br>
> +<br>
> +:- func build_range(int, int, regexp) = regexp.<br>
> +<br>
> +build_range(S, E, R0) = R :-<br>
> + ( if S < E then<br>
> + char.det_from_int(S, C),<br>
> + R1 = (R0 or re(C)),<br>
> + R = build_range(S + 1, E, R1)<br>
> + else if S = E then<br>
> + R = R0<br>
> + else<br>
> + throw(exception.software_error("invalid range!"))<br>
> + ).<br>
> +<br>
<br>
</div>Try to use more meaningful variable names: rather than S, E and C, call these<br>
Start, End and Char. I was able to work this out by looking at your code;<br>
however, you can avoid many misunderstandings with well-written code.<br>
<br>
We also have some useful exception throwing functions in the module require.<br>
error($file, $pred, "invalid range") will throw a software error exception<br>
that describes the location of the error.<br>
<div><br>
> %-----------------------------------------------------------------------------%<br>
> % Some useful single-char regexps.<br>
><br>
> @@ -768,13 +793,13 @@ alpha = (lower or upper).<br>
> alphanum = (alpha or digit).<br>
> identstart = (alpha or ('_')).<br>
> ident = (alphanum or ('_')).<br>
> -nl = re('\n').<br>
> tab = re('\t').<br>
> spc = re(' ').<br>
><br>
> %-----------------------------------------------------------------------------%<br>
> % Some useful compound regexps.<br>
><br>
> +nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline.<br>
> nat = +(digit).<br>
> signed_int = ?("+" or "-") ++ nat.<br>
> real = signed_int ++ (<br>
<br>
</div>Good idea.<br>
<div><br>
> diff --git a/extras/lex/samples/lex_demo.m b/extras/lex/samples/lex_demo.m<br>
> index 6d30ac2..68aef0d 100644<br>
> --- a/extras/lex/samples/lex_demo.m<br>
> +++ b/extras/lex/samples/lex_demo.m<br>
<br>
</div>The changes to this file seem to be missing.<br>
<span class="HOEnZb"><font color="#888888"><br>
<br>
--<br>
Paul Bone<br>
</font></span></blockquote></div><br></div></div>