[mercury-users] Tag-soup parser
Thomas Conway
conway at cs.mu.OZ.AU
Tue Jun 19 15:47:52 AEST 2001
Hi
For anyone who cares, I've written a very simple tag-soup parser
that more-or-less parses some html (i.e.
some [HTML] parses(HTML) <=> YMMV
), so you can convert it to xhtml, &c, and parse it properly.
It assumes that tags with minimised close tags are siblings.
For example:
<a>foo<b>bar<c>baz
corresponds to the list of elements:
[elem("a", []),
cdata("foo"),
elem("b", []),
cdata("bar"),
elem("c", []),
cdata("baz"),
whereas
<a>foo<b>bar<c>baz</a>
corresponds to
elem("a", [
cdata("foo"),
elem("b", []),
cdata("bar"),
elem("c", []),
cdata("baz")])
This at least gives you enough structure that with a bit of information
about the *real* dtd, to munge the data into the tree you want.
For example, you can go through the tree converting "P" elem-cdata
siblings into the expected nested structure. Note a flaw though, is
that there isn't a flag in the datatype to indicate if the tag was
explicitly or implicitly closed, and it doesn't do the right thing
with extra close tags e.g.
<a>foo<b>bar</a>baz</a>
is not an error - it just does the wrong thing. This would be easy
to fix - feel free. :-)
--
Thomas Conway )O+
<conway at cs.mu.oz.au> 499 User error! Replace user, and press any key.
-------------- next part --------------
:- module soup.
:- interface.
:- import_module list, string.
:- type [A|B] == list(A).
:- type [] ---> [].
:- type content
---> element(tagName, [{attrName,attrValue}], [content])
; comment(text)
; pi(text)
; cdata(text)
.
:- type tagName == string.
:- type attrName == string.
:- type attrValue == string.
:- type text == string.
:- pred parse(string, [content]).
:- mode parse(in, out) is semidet.
:- implementation.
:- import_module char.
:- type token
---> startTag(tagName, [{attrName,attrValue}])
; endTag(tagName)
; comment(text)
; pi(text)
; cdata(string)
.
parse(Str, Content) :-
string__to_char_list(Str, Chars),
parse([], Content, Chars, []).
:- pred parse([content], [content], [char], [char]).
:- mode parse(in, out, in, out) is semidet.
parse(Stack0, Result) -->
( token(Tok) ->
(
{ Tok = startTag(Name, Attrs) },
{ Stack = [element(Name, Attrs, [])|Stack0] }
;
{ Tok = endTag(Name) },
{ findTag(Name, Stack0, Stack1, Attrs, Content0) },
{ reverse(Content0, Content) },
{ Stack = [element(Name, Attrs, Content)|Stack1] }
;
{ Tok = comment(Text) },
{ Stack = [comment(Text)|Stack0] }
;
{ Tok = pi(Text) },
{ Stack = [pi(Text)|Stack0] }
;
{ Tok = cdata(Text) },
{ Stack = [cdata(Text)|Stack0] }
),
parse(Stack, Result)
;
{ Result = Stack0 }
).
:- pred findTag(tagName, [content], [content],
[{attrName,attrValue}], [content]).
:- mode findTag(in, in, out, out, out) is semidet.
findTag(Name, Stack0, Stack, Attrs, Content) :-
Stack0 = [Content0|Stack1],
( Content0 = element(Name, FoundAttrs, FoundContent) ->
Stack = Stack1,
Attrs = FoundAttrs,
Content = FoundContent
;
Content = [Content0|RestContent],
findTag(Name, Stack1, Stack, Attrs, RestContent)
).
:- pred token(token, [char], [char]).
:- mode token(out, in, out) is semidet.
token(Token) -->
( ['<'], name(Name), ws, attrs([], Attrs), ['>'] ->
{ Token = startTag(Name, Attrs) }
; ['<'], ['/'], name(Name), ['>'] ->
{ Token = endTag(Name) }
; ['<'], ['!'], ['-'], ['-'], uptoMM(Text), ['-'], ['-'], ['>'] ->
{ Token = comment(Text) }
; ['<'], ['?'], cdata(Text), ['>'] ->
{ Token = pi(Text) }
;
cdata(Text),
{ Token = cdata(Text) }
).
:- pred attrs([{attrName,attrValue}], [{attrName,attrValue}], [char], [char]).
:- mode attrs(in, out, in, out) is semidet.
attrs(Attrs0, Attrs) -->
( attr(Attr) ->
ws,
attrs([Attr|Attrs0], Attrs)
;
{ Attrs = Attrs0 }
).
:- pred attr({attrName,attrValue}, [char], [char]).
:- mode attr(out, in, out) is semidet.
attr({AttrName, AttrValue}) -->
name(AttrName), ['='],
( ws, ['\''] ->
upto('\'', [], Chars), ['\''],
{ string__from_rev_char_list(Chars, AttrValue) }
; ws, ['"'] ->
upto('"', [], Chars), ['"'],
{ string__from_rev_char_list(Chars, AttrValue) }
;
uptoWs([], Chars),
{ string__from_rev_char_list(Chars, AttrValue) }
).
:- pred cdata(string, [char], [char]).
:- mode cdata(out, in, out) is semidet.
cdata(Text) -->
cdata([], Chars),
{ Chars = [_|_] },
{ string__from_rev_char_list(Chars, Text) }.
:- pred cdata([char], [char], [char], [char]).
:- mode cdata(in, out, in, out) is det.
cdata(Chars0, Chars) -->
(
=([C|_]),
{ C = ('<') ; C = ('>') }
->
{ Chars = Chars0 }
; [Char] ->
cdata([Char|Chars0], Chars)
;
{ Chars = Chars0 }
).
:- pred name(string, [char], [char]).
:- mode name(out, in, out) is semidet.
name(Name) -->
name([], Chars),
{ Chars = [_|_] },
{ string__from_rev_char_list(Chars, Name) }.
:- pred name([char], [char], [char], [char]).
:- mode name(in, out, in, out) is semidet.
name(Chars0, Chars) -->
( nameChar(Char) ->
name([Char|Chars0], Chars)
;
{ Chars = Chars0 }
).
:- pred upto(char, [char], [char], [char], [char]).
:- mode upto(in, in, out, in, out) is det.
upto(EndChar, Chars0, Chars) -->
( =([EndChar|_]) ->
{ Chars = Chars0 }
; [Char] ->
upto(EndChar, [Char|Chars0], Chars)
;
{ Chars = Chars0 }
).
:- pred uptoWs([char], [char], [char], [char]).
:- mode uptoWs(in, out, in, out) is det.
uptoWs(Chars0, Chars) -->
( =([Char|_]), { char__is_whitespace(Char) } ->
{ Chars = Chars0 }
; [Char] ->
uptoWs([Char|Chars0], Chars)
;
{ Chars = Chars0 }
).
:- pred uptoMM(string, [char], [char]).
:- mode uptoMM(out, in, out) is det.
uptoMM(Str) -->
uptoMM([], Chars),
{ string__from_rev_char_list(Chars, Str) }.
:- pred uptoMM([char], [char], [char], [char]).
:- mode uptoMM(in, out, in, out) is det.
uptoMM(Chars0, Chars) -->
( =([('-'),('-')|_]) ->
{ Chars = Chars0 }
; [Char] ->
uptoMM([Char|Chars0], Chars)
;
{ Chars = Chars0 }
).
:- pred ws([char], [char]).
:- mode ws(in, out) is det.
ws -->
( [C], { char__is_whitespace(C) } ->
ws
;
[]
).
:- pred nameChar(char, [char], [char]).
:- mode nameChar(out, in, out) is semidet.
nameChar('a') --> ['a'].
nameChar('b') --> ['b'].
nameChar('c') --> ['c'].
nameChar('d') --> ['d'].
nameChar('e') --> ['e'].
nameChar('f') --> ['f'].
nameChar('g') --> ['g'].
nameChar('h') --> ['h'].
nameChar('i') --> ['i'].
nameChar('j') --> ['j'].
nameChar('k') --> ['k'].
nameChar('l') --> ['l'].
nameChar('m') --> ['m'].
nameChar('n') --> ['n'].
nameChar('o') --> ['o'].
nameChar('p') --> ['p'].
nameChar('q') --> ['q'].
nameChar('r') --> ['r'].
nameChar('s') --> ['s'].
nameChar('t') --> ['t'].
nameChar('u') --> ['u'].
nameChar('v') --> ['v'].
nameChar('w') --> ['w'].
nameChar('x') --> ['x'].
nameChar('y') --> ['y'].
nameChar('z') --> ['z'].
nameChar('A') --> ['A'].
nameChar('B') --> ['B'].
nameChar('C') --> ['C'].
nameChar('D') --> ['D'].
nameChar('E') --> ['E'].
nameChar('F') --> ['F'].
nameChar('G') --> ['G'].
nameChar('H') --> ['H'].
nameChar('I') --> ['I'].
nameChar('J') --> ['J'].
nameChar('K') --> ['K'].
nameChar('L') --> ['L'].
nameChar('M') --> ['M'].
nameChar('N') --> ['N'].
nameChar('O') --> ['O'].
nameChar('P') --> ['P'].
nameChar('Q') --> ['Q'].
nameChar('R') --> ['R'].
nameChar('S') --> ['S'].
nameChar('T') --> ['T'].
nameChar('U') --> ['U'].
nameChar('V') --> ['V'].
nameChar('W') --> ['W'].
nameChar('X') --> ['X'].
nameChar('Y') --> ['Y'].
nameChar('Z') --> ['Z'].
nameChar('0') --> ['0'].
nameChar('1') --> ['1'].
nameChar('2') --> ['2'].
nameChar('3') --> ['3'].
nameChar('4') --> ['4'].
nameChar('5') --> ['5'].
nameChar('6') --> ['6'].
nameChar('7') --> ['7'].
nameChar('8') --> ['8'].
nameChar('9') --> ['9'].
nameChar('_') --> ['_'].
nameChar(':') --> [':'].
nameChar('-') --> ['-'].
More information about the users
mailing list