[mercury-users] Tag-soup parser

Thomas Conway conway at cs.mu.OZ.AU
Tue Jun 19 15:47:52 AEST 2001


Hi
For anyone who cares, I've written a very simple tag-soup parser
that more-or-less parses some html (i.e.
	some [HTML] parses(HTML) <=> YMMV
), so you can convert it to xhtml, &c, and parse it properly.
It assumes that tags with minimised close tags are siblings.
For example:
	<a>foo<b>bar<c>baz
corresponds to the list of elements:
	[elem("a", []),
	 cdata("foo"),
	 elem("b", []),
	 cdata("bar"),
	 elem("c", []),
	 cdata("baz"),
whereas
	<a>foo<b>bar<c>baz</a>
corresponds to
	elem("a", [
	    cdata("foo"),
	    elem("b", []),
	    cdata("bar"),
	    elem("c", []),
	    cdata("baz")])

This at least gives you enough structure that with a bit of information
about the *real* dtd, to munge the data into the tree you want.
For example, you can go through the tree converting "P" elem-cdata
siblings into the expected nested structure. Note a flaw though, is
that there isn't a flag in the datatype to indicate if the tag was
explicitly or implicitly closed, and it doesn't do the right thing
with extra close tags e.g.
	<a>foo<b>bar</a>baz</a>
is not an error - it just does the wrong thing. This would be easy
to fix - feel free. :-)

-- 
  Thomas Conway )O+
 <conway at cs.mu.oz.au>       499 User error! Replace user, and press any key.
-------------- next part --------------
:- module soup.

:- interface.

:- import_module list, string.

:- type [A|B] == list(A).
:- type [] ---> [].

:- type content
	--->	element(tagName, [{attrName,attrValue}], [content])
	;	comment(text)
	;	pi(text)
	;	cdata(text)
	.

:- type tagName == string.
:- type attrName == string.
:- type attrValue == string.
:- type text == string.

:- pred parse(string, [content]).
:- mode parse(in, out) is semidet.

:- implementation.

:- import_module char.

:- type token
	--->	startTag(tagName, [{attrName,attrValue}])
	;	endTag(tagName)
	;	comment(text)
	;	pi(text)
	;	cdata(string)
	.

parse(Str, Content) :-
    string__to_char_list(Str, Chars),
    parse([], Content, Chars, []).

:- pred parse([content], [content], [char], [char]).
:- mode parse(in, out, in, out) is semidet.

parse(Stack0, Result) -->
    ( token(Tok) ->
    	(
	    { Tok = startTag(Name, Attrs) },
	    { Stack = [element(Name, Attrs, [])|Stack0] }
	;
	    { Tok = endTag(Name) },
	    { findTag(Name, Stack0, Stack1, Attrs, Content0) },
	    { reverse(Content0, Content) },
	    { Stack = [element(Name, Attrs, Content)|Stack1] }
	;
	    { Tok = comment(Text) },
	    { Stack = [comment(Text)|Stack0] }
	;
	    { Tok = pi(Text) },
	    { Stack = [pi(Text)|Stack0] }
	;
	    { Tok = cdata(Text) },
	    { Stack = [cdata(Text)|Stack0] }
	),
	parse(Stack, Result)
    ;
    	{ Result = Stack0 }
    ).

:- pred findTag(tagName, [content], [content],
		[{attrName,attrValue}], [content]).
:- mode findTag(in, in, out, out, out) is semidet.

findTag(Name, Stack0, Stack, Attrs, Content) :-
    Stack0 = [Content0|Stack1],
    ( Content0 = element(Name, FoundAttrs, FoundContent) ->
    	Stack = Stack1,
	Attrs = FoundAttrs,
	Content = FoundContent
    ;
    	Content = [Content0|RestContent],
	findTag(Name, Stack1, Stack, Attrs, RestContent)
    ).

:- pred token(token, [char], [char]).
:- mode token(out, in, out) is semidet.

token(Token) -->
    ( ['<'], name(Name), ws, attrs([], Attrs), ['>'] ->
    	{ Token = startTag(Name, Attrs) }
    ; ['<'], ['/'], name(Name), ['>'] ->
    	{ Token = endTag(Name) }
    ; ['<'], ['!'], ['-'], ['-'], uptoMM(Text), ['-'], ['-'], ['>'] ->
    	{ Token = comment(Text) }
    ; ['<'], ['?'], cdata(Text), ['>'] ->
    	{ Token = pi(Text) }
    ;
	cdata(Text),
	{ Token = cdata(Text) }
    ).

:- pred attrs([{attrName,attrValue}], [{attrName,attrValue}], [char], [char]).
:- mode attrs(in, out, in, out) is semidet.

attrs(Attrs0, Attrs) -->
    ( attr(Attr) ->
    	ws,
    	attrs([Attr|Attrs0], Attrs)
    ;
    	{ Attrs = Attrs0 }
    ).

:- pred attr({attrName,attrValue}, [char], [char]).
:- mode attr(out, in, out) is semidet.

attr({AttrName, AttrValue}) -->
    name(AttrName), ['='],
    ( ws, ['\''] ->
    	upto('\'', [], Chars), ['\''],
	{ string__from_rev_char_list(Chars, AttrValue) }
    ; ws, ['"'] ->
    	upto('"', [], Chars), ['"'],
	{ string__from_rev_char_list(Chars, AttrValue) }
    ;
    	uptoWs([], Chars),
	{ string__from_rev_char_list(Chars, AttrValue) }
    ).

:- pred cdata(string, [char], [char]).
:- mode cdata(out, in, out) is semidet.

cdata(Text) -->
    cdata([], Chars),
    { Chars = [_|_] },
    { string__from_rev_char_list(Chars, Text) }.

:- pred cdata([char], [char], [char], [char]).
:- mode cdata(in, out, in, out) is det.

cdata(Chars0, Chars) -->
    (
    	=([C|_]),
    	{ C = ('<') ; C = ('>') }
    ->
    	{ Chars = Chars0 }
    ; [Char] ->
	cdata([Char|Chars0], Chars)
    ;
    	{ Chars = Chars0 }
    ).

:- pred name(string, [char], [char]).
:- mode name(out, in, out) is semidet.

name(Name) -->
    name([], Chars),
    { Chars = [_|_] },
    { string__from_rev_char_list(Chars, Name) }.

:- pred name([char], [char], [char], [char]).
:- mode name(in, out, in, out) is semidet.

name(Chars0, Chars) -->
    ( nameChar(Char) ->
    	name([Char|Chars0], Chars)
    ;
    	{ Chars = Chars0 }
    ).

:- pred upto(char, [char], [char], [char], [char]).
:- mode upto(in, in, out, in, out) is det.

upto(EndChar, Chars0, Chars) -->
    ( =([EndChar|_]) ->
    	{ Chars = Chars0 }
    ; [Char] ->
    	upto(EndChar, [Char|Chars0], Chars)
    ;
    	{ Chars = Chars0 }
    ).

:- pred uptoWs([char], [char], [char], [char]).
:- mode uptoWs(in, out, in, out) is det.

uptoWs(Chars0, Chars) -->
    ( =([Char|_]), { char__is_whitespace(Char) } ->
    	{ Chars = Chars0 }
    ; [Char] ->
    	uptoWs([Char|Chars0], Chars)
    ;
    	{ Chars = Chars0 }
    ).

:- pred uptoMM(string, [char], [char]).
:- mode uptoMM(out, in, out) is det.

uptoMM(Str) -->
    uptoMM([], Chars),
    { string__from_rev_char_list(Chars, Str) }.

:- pred uptoMM([char], [char], [char], [char]).
:- mode uptoMM(in, out, in, out) is det.

uptoMM(Chars0, Chars) -->
    ( =([('-'),('-')|_]) ->
    	{ Chars = Chars0 }
    ; [Char] ->
    	uptoMM([Char|Chars0], Chars)
    ;
    	{ Chars = Chars0 }
    ).

:- pred ws([char], [char]).
:- mode ws(in, out) is det.

ws -->
    ( [C], { char__is_whitespace(C) } ->
    	ws
    ;
    	[]
    ).

:- pred nameChar(char, [char], [char]).
:- mode nameChar(out, in, out) is semidet.

nameChar('a') --> ['a'].
nameChar('b') --> ['b'].
nameChar('c') --> ['c'].
nameChar('d') --> ['d'].
nameChar('e') --> ['e'].
nameChar('f') --> ['f'].
nameChar('g') --> ['g'].
nameChar('h') --> ['h'].
nameChar('i') --> ['i'].
nameChar('j') --> ['j'].
nameChar('k') --> ['k'].
nameChar('l') --> ['l'].
nameChar('m') --> ['m'].
nameChar('n') --> ['n'].
nameChar('o') --> ['o'].
nameChar('p') --> ['p'].
nameChar('q') --> ['q'].
nameChar('r') --> ['r'].
nameChar('s') --> ['s'].
nameChar('t') --> ['t'].
nameChar('u') --> ['u'].
nameChar('v') --> ['v'].
nameChar('w') --> ['w'].
nameChar('x') --> ['x'].
nameChar('y') --> ['y'].
nameChar('z') --> ['z'].
nameChar('A') --> ['A'].
nameChar('B') --> ['B'].
nameChar('C') --> ['C'].
nameChar('D') --> ['D'].
nameChar('E') --> ['E'].
nameChar('F') --> ['F'].
nameChar('G') --> ['G'].
nameChar('H') --> ['H'].
nameChar('I') --> ['I'].
nameChar('J') --> ['J'].
nameChar('K') --> ['K'].
nameChar('L') --> ['L'].
nameChar('M') --> ['M'].
nameChar('N') --> ['N'].
nameChar('O') --> ['O'].
nameChar('P') --> ['P'].
nameChar('Q') --> ['Q'].
nameChar('R') --> ['R'].
nameChar('S') --> ['S'].
nameChar('T') --> ['T'].
nameChar('U') --> ['U'].
nameChar('V') --> ['V'].
nameChar('W') --> ['W'].
nameChar('X') --> ['X'].
nameChar('Y') --> ['Y'].
nameChar('Z') --> ['Z'].
nameChar('0') --> ['0'].
nameChar('1') --> ['1'].
nameChar('2') --> ['2'].
nameChar('3') --> ['3'].
nameChar('4') --> ['4'].
nameChar('5') --> ['5'].
nameChar('6') --> ['6'].
nameChar('7') --> ['7'].
nameChar('8') --> ['8'].
nameChar('9') --> ['9'].
nameChar('_') --> ['_'].
nameChar(':') --> [':'].
nameChar('-') --> ['-'].



More information about the users mailing list