[m-dev.] more on regular expressions

Thomas Conway conway at cs.mu.OZ.AU
Wed Dec 22 16:27:07 AEDT 1999


Hi

Here's 2c while I wait for the C++ compiler to consent to build
stuff (templates are awful; C++'s lack of a module system is awful -
well to be more accurate, it has several disjoint pieces of module
systems, but no integrated, complete one...)... I haven't really
been following the regex thread closely, but here's some observations,
and some code. Feel free to do with as you please...

I played around with regular expressions some time ago. I found that
to be practical, you actually need to do some preprocessing to ensure
termination. The typical problem is where you get a regex like a**
(ie star(star(const('a')))) and you give it the string "b". The natural
implementation of star leads to an infinite loop in this situation
because it parses off an infinite number of epsilons (obtainable from
the inner a*).

So, you need to do some processing so that you don't loop in these
situations. One approach that handles this is to have a function that
takes the regex and returns a DFA, and then have a match predicate that
interprets the DFA.

[hunts through various old code...]

Ah, here's some code that does exactly that. It assumes the existence
of a 'getchar' predicate that has the usual C "ignore errors" kind of
behaviour. :-(


:- module scan.

:- interface.

:- import_module int, io, list, std_util.

:- type key	== int.

:- type regex
	--->	c(key)
	;	s(regex, regex)
	;	a(regex, regex)
	;	'*'(regex)
	;	'+'(regex)
	.

:- type dfa.

:- func dfa(regex) = dfa.

:- pred scan(dfa, maybe(list(key)), io__state, io__state).
:- mode scan(in, out, di, uo) is det.

%------------------------------------------------------------------------------%

:- implementation.

:- import_module array, bool, char, map, set.

:- type dfa
	--->	dfa(
			array(dfa_state)
		).

:- type dfa_state
	--->	state(
			terminal,
			array(char),
			array(index)	% corresponding transition
		).

:- type terminal
	--->	terminal ; nonterminal .

:- type index	== int.

%------------------------------------------------------------------------------%

scan(dfa(States), Result) -->
	scan(0, States, [], Result).

:- pred scan(index, array(dfa_state), list(key), maybe(list(key)),
		io__state, io__state).
:- mode scan(in, in, in, out, di, uo) is det.

scan(Index, States, Keys0, Result) -->
	{ lookup(States, Index, state(Term, Syms, Follow)) },
	( { Term = terminal } ->
		{ reverse(Keys0, Keys) },
		{ Result = yes(Keys) }
	;
		getchar(Key),
		{ bsearch(Syms, Key, cmp, MInd) },
		(
			{ MInd = yes(Loc) },
			{ lookup(Follow, Loc, Index1) },
			scan(Index1, States, [Key|Keys0], Result)
		;
			{ MInd = no },
			{ Result = no }
		)
	).

:- pred cmp(key, key, comparison_result).
:- mode cmp(in, in, out) is det.

cmp(A, B, Res) :-
	compare(Res, A, B).

%------------------------------------------------------------------------------%

:- type augregex
	--->	c(key, int)
	;	s(augregex, augregex)
	;	a(augregex, augregex)
	;	'*'(augregex)
	.

:- type ttable ==  map(set(int), map(key, set(int))).

dfa(Regex) = DFA :-
	augregex(Regex, 0, AugRegex0, N),
	AugRegex = s(AugRegex0, c(-1, N)),
	Start = firstpos(AugRegex),
	UnMarked = [Start],
	init(Trans0),
	mark(UnMarked, AugRegex, Trans0, Trans),
	keys(Trans, States),
	init(SNums0),
	number_states(States, 0, SNums0, SNums),
	cons_states(States, SNums, Trans, N, StateList),
	DFA = dfa(array(StateList)).

%------------------------------------------------------------------------------%

:- pred mark(list(set(int)), augregex, ttable, ttable).
:- mode mark(in, in, in, out) is det.

mark([], _, Trans, Trans).
mark([T|Ts0], AugRegex, Trans0, Trans) :-
	% SymPos = sympos(T, AugRegex),
	makeU(T, AugRegex, SymPos),
	init(TTrans0),
	det_insert(Trans0, T, TTrans0, Trans1),
	mark2(SymPos, T, AugRegex, Ts0, Ts1, Trans1, Trans2),
	mark(Ts1, AugRegex, Trans2, Trans).

:- pred mark2(list(pair(key, set(int))), set(int), augregex,
		list(set(int)), list(set(int)),
		ttable, ttable).
:- mode mark2(in, in, in, in, out, in, out) is det.

mark2([], _, _, Ts, Ts, Trans, Trans).
mark2([Sym - U|SymPos], T, AugRegex, Ts0, Ts, Trans0, Trans) :-
	(
		not empty(U),
		not member(U, Ts0),
		not U = T,
		not contains(Trans0, U)
	->
		Ts1 = [U|Ts0]
	;
		Ts1 = Ts0
	),
	( search(Trans0, T, TTrans0) ->
		TTrans1 = TTrans0
	;
		init(TTrans1)
	),
	( search(TTrans1, Sym, U) ->
		TTrans = TTrans1
	;
		map__det_insert(TTrans1, Sym, U, TTrans)
	),
	map__set(Trans0, T, TTrans, Trans1),
	mark2(SymPos, T, AugRegex, Ts1, Ts, Trans1, Trans).

%------------------------------------------------------------------------------%

:- pred number_states(list(set(int)), int,
		map(set(int), int), map(set(int), int)).
:- mode number_states(in, in, in, out) is det.

number_states([], _, SNums, SNums).
number_states([S|Ss], N, SNums0, SNums) :-
	det_insert(SNums0, S, N, SNums1),
	number_states(Ss, N+1, SNums1, SNums).

:- pred cons_states(list(set(int)), map(set(int), int), ttable, int,
		list(dfa_state)).
:- mode cons_states(in, in, in, in, out) is det.

cons_states([], _, _, _, []).
cons_states([S|Ss], Nums, Trans, Term, [State|States]) :-
	lookup(Trans, S, SymTrans),
	keys(SymTrans, Symbols),
	values(SymTrans, Transitions0),
	map((pred(TS::in, N::out) is det :-
		( search(Nums, TS, N0) ->
			N = N0
		;
			N = -1
		)
	), Transitions0, Transitions),
	( member(Term, S) ->
		Terminal = terminal
	;
		Terminal = nonterminal
	),
	State = state(Terminal, array(Symbols), array(Transitions)),
	cons_states(Ss, Nums, Trans, Term, States).

%------------------------------------------------------------------------------%

:- pred augregex(regex, int, augregex, int).
:- mode augregex(in, in, out, out) is det.

augregex(c(Key), N, c(Key, N), N+1).
augregex(s(A0, B0), N0, s(A, B), N2) :-
	augregex(A0, N0, A, N1),
	augregex(B0, N1, B, N2).
augregex(a(A0, B0), N0, a(A, B), N2) :-
	augregex(A0, N0, A, N1),
	augregex(B0, N1, B, N2).
augregex('*'(A0), N0, '*'(A), N1) :-
	augregex(A0, N0, A, N1).
augregex('+'(A0), N0, A, N1) :-
	augregex(s(A0, '*'(A0)), N0, A, N1).

%------------------------------------------------------------------------------%

:- func followpos(augregex, int) = set(int).

followpos(s(A, B), Pos) = Follow :-
	Follow0 = union(followpos(A, Pos), followpos(B, Pos)),
	( member(Pos, lastpos(A)) ->
		Follow = union(firstpos(B), Follow0)
	;
		Follow = Follow0
	).
followpos('*'(A), Pos) = Follow :-
	Follow0 = followpos(A, Pos),
	( member(Pos, lastpos(A)) ->
		Follow = union(firstpos(A), Follow0)
	;
		Follow = Follow0
	).
followpos(c(_, _), _Pos) = emp.
followpos(a(A, B), Pos) = union(followpos(A, Pos), followpos(B, Pos)).

:- func emp = set(int).
emp = Set :- set__init(Set).

%------------------------------------------------------------------------------%

:- pred makeU(set(int), augregex, list(pair(key, set(int)))).
:- mode makeU(in, in, out) is det.

makeU(T, AugRegex, Us) :-
	init(UsMap0),
	aggregate((pred((A - FollowPoA)::out) is nondet :-
		augregex_member(AugRegex, A, P),
		member(P, T),
		FollowPoA = followpos(AugRegex, P)
	), (pred((B - FoB)::in, Map0::in, Map::out) is det  :-
		( search(Map0, B, Set0) ->
			Set1 = Set0
		;
			init(Set1)
		),
		Set = union(FoB, Set1),
		set(Map0, B, Set, Map)
	), UsMap0, UsMap),
	to_assoc_list(UsMap, Us).

:- pred augregex_member(augregex::in, key::out, int::out) is nondet.

augregex_member(c(Key, Pos), Key, Pos).
augregex_member(s(A, B), Key, Pos) :-
	(
		augregex_member(A, Key, Pos)
	;
		augregex_member(B, Key, Pos)
	).
augregex_member(a(A, B), Key, Pos) :-
	(
		augregex_member(A, Key, Pos)
	;
		augregex_member(B, Key, Pos)
	).
augregex_member('*'(A), Key, Pos) :-
	augregex_member(A, Key, Pos).

%------------------------------------------------------------------------------%

:- func nullable(augregex) = bool.
nullable(c(_, _)) = no.
nullable(s(A, B)) = (nullable(A) and nullable(B)).
nullable(a(A, B)) = (nullable(A) or nullable(B)).
nullable('*'(_)) = yes.

%------------------------------------------------------------------------------%

:- func firstpos(augregex) = set(int).
firstpos(c(_, I)) = set(I).
firstpos(a(A, B)) = union(firstpos(A), firstpos(B)).
firstpos(s(A, B)) = ( nullable(A) = yes  ->
			union(firstpos(A), firstpos(B))
		    ;	firstpos(A)
		    ).
firstpos('*'(A))  = firstpos(A).

%------------------------------------------------------------------------------%

:- func lastpos(augregex) = set(int).
lastpos(c(_, I)) = set(I).
lastpos(a(A, B)) = union(lastpos(A), lastpos(B)).
lastpos(s(A, B)) = ( nullable(B) = yes  ->
			union(lastpos(A), lastpos(B))
		    ;	lastpos(B)
		    ).
lastpos('*'(A))  = lastpos(A).

%------------------------------------------------------------------------------%

:- func (bool and bool) = bool.
(no and _) = no.
(yes and no) = no.
(yes and yes) = yes.

:- func (bool or bool) = bool.
(yes or _) = yes.
(no or yes) = yes.
(no or no) = no.

%------------------------------------------------------------------------------%

:- func set(int) = set(int).
set(I) = Set :- set__singleton_set(Set, I).

:- func union(set(int), set(int)) = set(int).
union(As, Bs) = Cs :- union(As, Bs, Cs).

%------------------------------------------------------------------------------%

-- 
 Thomas Conway )O+     Every sword has two edges.
     Mercurian            <conway at cs.mu.oz.au>
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to:       mercury-developers at cs.mu.oz.au
Administrative Queries: owner-mercury-developers at cs.mu.oz.au
Subscriptions:          mercury-developers-request at cs.mu.oz.au
--------------------------------------------------------------------------



More information about the developers mailing list