[m-dev.] For review: add my XML parser to the extras

Thomas Conway conway at cs.mu.OZ.AU
Mon Aug 7 12:28:24 AEST 2000


Hi

Here's my XML parser, which would make a good addition to the extras.

-- 
 Thomas Conway              Unicode for terrorists: U+0001 300C
 <conway at cs.mu.oz.au>         "Tie his hands behind his back"
 Mercurian )O+  

-------------- next part --------------

    Add an XML parser to the extras directory

extras/xml/README:
    A short readme describing the parser.

extras/xml/Mmake:
    The Mmake file for the sample program.

extras/xml/tryit.m:
    A sample program that just parses the files named on the command line.

extras/xml/xml*.m:
    The parser itself.

extras/xml/unicode.m:
    A set of functions that return the Unicode code points of the printable
    ASCII symbols.

extras/xml/parsing.m:
    The parsing combinator set used by the xml parser.

-------------- next part --------------

# Running mmake with no explicit target builds the sample program.
default_target : tryit

# `depend' is forwarded to the tryit.depend target.
depend : tryit.depend
-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module parsing.

:- interface.

:- import_module string, unicode.
:- import_module io, list, map, std_util.

% Modes used for the parser-state arguments of every combinator.
% Despite their names these are plain `in' and `out'; the io__state held
% inside the parser state is made unique again with u/1 in the
% implementation section.
:- mode pdi == in.
:- mode puo == out.

% The name attached to an entity: none at all, or a string tagged as
% internal or external.
:- type entityName
	--->	anon
	;	internal(string)
	;	external(string)
	.

% A piece of text being parsed.
%	curr	current position; mkEntity starts it at 0
%	leng	length of `text'; parseEntity treats curr = leng as
%		"entity fully consumed"
%	text	the text itself
%	name	the name of the entity
:- type entity
	--->	entity(
		    curr	:: int,
		    leng	:: int,
		    text	:: string,
		    name	:: entityName
		).

% An existentially typed wrapper around any value whose type is an
% instance of the encoding/1 typeclass.
:- type encoding
	--->	some [Enc] (enc(Enc) => encoding(Enc)).

% mkEntity(Text) builds an anonymous entity positioned at 0;
% mkEntity(Name, Text) does the same but records Name.
:- func mkEntity(string) = entity.
:- func mkEntity(entityName, string) = entity.

% A character encoding.
%	decode(Enc, U, Entity0, Entity) reads one unicode value U from
%	Entity0, giving Entity; tok/2 reports a failure of decode as "eof".
%	encode(Enc, Us, Str) converts the unicode values Us to the string Str.
:- typeclass encoding(Enc) where [
	(pred decode(Enc, unicode, entity, entity)),
	(mode decode(in, out, in, out) is semidet),
	(pred encode(Enc, list(unicode), string)),
	(mode encode(in, in, out) is det)
].

% Wrap an instance of encoding/1 in the existential `encoding' type.
:- func (mkEncoding(Enc) = encoding) <= encoding(Enc).

% Constraint used by get/4 and set/4 to relate a key type K to the type V
% of the value stored under it.  The class has no methods.
:- typeclass global(K, V) where [].

% The store behind get/4 and set/4: keys and values are both wrapped
% in univ.
:- type globals == map(univ, univ).

% The parser state (abstract).  T is the type of the result carried by
% the state when its status is `ok'.
:- type pstate(T).

% The final outcome of a parse, as produced by parse/4 and finish/3.
:- type parse(T)
	--->	ok(T)
	;	error(string).

% pstate(Entity, Encoding, Globals, IO, PS): create the initial parser
% state PS, which takes ownership of the io__state.
:- pred pstate(entity, encoding, globals, io__state, pstate('')).
:- mode pstate(in, in, in, di, puo) is det.

% finish(Res, PS, IO): convert the status of PS to a parse/1 result and
% hand back the io__state stored in PS.
:- pred finish(parse(T1), pstate(T1), io__state).
:- mode finish(out, pdi, uo) is det.

% try(P, S, F, E): run P, then
%	- if P succeeded with X, call S(X);
%	- if P failed with Msg, restore the status and the position that
%	  were current before P ran and call F(Msg);
%	- if P raised the error Msg, restore the earlier status (but not
%	  the position) and call E(Msg).
:- pred try(parser(T1, T2),
	    pred(T2, pstate(T2), pstate(T3)),
	    pred(string, pstate(T1), pstate(T3)),
	    pred(string, pstate(T1), pstate(T3)),
	    pstate(T1), pstate(T3)).
:- mode try(in(parser),
	    pred(in, pdi, puo) is det,
	    pred(in, pdi, puo) is det,
	    pred(in, pdi, puo) is det, pdi, puo) is det.

% parse(P, Res): call P and report the resulting status as Res.
% Both `fail' and `error' statuses are reported as error(Msg).
:- pred parse(parser(T1, T2), parse(T2), pstate(T1), pstate(T2)).
:- mode parse(in(parser), out, pdi, puo) is det.

% parseEntity(P, Entity): run P on Entity instead of the current entity.
% If P stops with Entity fully consumed the previous entity is restored,
% otherwise the status becomes an error.
:- pred parseEntity(parser(T1, T2), entity, pstate(T1), pstate(T2)).
:- mode parseEntity(in(parser), in, pdi, puo) is det.

% Read the next unicode value from the current entity using the current
% encoding.  Fails with "eof" if nothing can be decoded.
:- pred tok(pstate(_), pstate(unicode)).
:- mode tok(pdi, puo) is det.

% return(X): set the status to ok(X).
:- pred return(T, pstate(_), pstate(T)).
:- mode return(in, pdi, puo) is det.

% return: set the status to ok('').
:- pred return(pstate(_), pstate('')).
:- mode return(pdi, puo) is det.

% fail(Msg): set the status to fail(Msg).
:- pred fail(string, pstate(_), pstate(_)).
:- mode fail(in, pdi, puo) is det.

% error(Msg): set the status to error(Msg).
:- pred error(string, pstate(_), pstate(_)).
:- mode error(in, pdi, puo) is det.

% Replace / fetch the encoding stored in the parser state.
:- pred setEncoding(encoding, pstate(T1), pstate(T1)).
:- mode setEncoding(in, pdi, puo) is det.

:- pred getEncoding(encoding, pstate(T1), pstate(T1)).
:- mode getEncoding(out, pdi, puo) is det.

% lit1(U, R): read one token; return R if it equals U, otherwise fail.
:- pred lit1(unicode, T, pstate(_), pstate(T)).
:- mode lit1(in, in, pdi, puo) is det.

% lit1(U): read one token; return U if it equals U, otherwise fail.
:- pred lit1(unicode, pstate(_), pstate(unicode)).
:- mode lit1(in, pdi, puo) is det.

% lit(Str): match the characters of Str one token at a time and
% return Str.
:- pred lit(string, pstate(_), pstate(string)).
:- mode lit(in, pdi, puo) is det.

% lit(Str, Thing): as lit/3, but return Thing.
:- pred lit(string, T, pstate(_), pstate(T)).
:- mode lit(in, in, pdi, puo) is det.

% Read one token and return it if it is a single or a double quote,
% otherwise fail.
:- pred quote(pstate(_), pstate(unicode)).
:- mode quote(pdi, puo) is det.

% io(Pred, Res): run Pred on the io__state held in the parser state,
% returning its output Res.
:- pred io(pred(T1, io__state, io__state), T1, pstate(T2), pstate(T2)).
:- mode io(pred(out, di, uo) is det, out, pdi, puo) is det.

% io(Pred): as io/4, for a Pred without an output argument.
:- pred io(pred(io__state, io__state), pstate(T2), pstate(T2)).
:- mode io(pred(di, uo) is det, pdi, puo) is det.

% mkString(Us, Str): encode the unicode values Us as Str using the
% current encoding.  The parser state is not changed.
:- pred mkString(list(unicode), string, pstate(T1), pstate(T1)).
:- mode mkString(in, out, pdi, puo) is det.

% A pair, written with the comma functor; this is the result type
% of and/4.
:- type (A, B) ---> (A, B).

% An optional value.
:- type opt(T) ---> no ; yes(T).

% A type with the single value ''; used as the result of parsers that
% return nothing interesting.
:- type '' ---> ''.

% A parser maps a parser state carrying a T1 to one carrying a T2.
:- type parser(T1, T2) == pred(pstate(T1), pstate(T2)).

% The inst every parser argument must have.
:- inst parser = (pred(pdi, puo) is det).

% (A and B): run A and then B, returning both results as a pair.
:- pred and(parser(T1, T2), parser(T2, T3), pstate(T1), pstate((T2, T3))).
:- mode and(in(parser), in(parser), pdi, puo) is det.

% (A or B): run A; if it fails, run B from the position A started at.
% An error raised by A is passed on.
:- pred or(parser(T1, T2), parser(T1, T2), pstate(T1), pstate(T2)).
:- mode or(in(parser), in(parser), pdi, puo) is det.

% (P then T): run P; if it succeeds with X, call T(X), otherwise pass
% the failure or error on.
:- pred then(parser(W, T), pred(T, pstate(T), pstate(U)), pstate(W), pstate(U)).
:- mode then(in(parser), pred(in, pdi, puo) is det, pdi, puo) is det.

% star(P): apply P repeatedly until it fails, returning the results
% in order.  An application of P that succeeds without reading any
% token is turned into a failure ("star(null)").
:- pred star(parser(T1, T2), pstate(T1), pstate(list(T2))).
:- mode star(in(parser), pdi, puo) is det.

% plus(P): as star(P), but P must succeed at least once.
:- pred plus(parser(T1, T2), pstate(T1), pstate(list(T2))).
:- mode plus(in(parser), pdi, puo) is det.

% opt(P, Def): the result of P, or Def if P fails.
:- pred opt(parser(T1, T2), T2, pstate(T1), pstate(T2)).
:- mode opt(in(parser), in, pdi, puo) is det.

% opt(P): yes(X) if P succeeds with X, `no' if P fails.
:- pred opt(parser(T1, T2), pstate(T1), pstate(opt(T2))).
:- mode opt(in(parser), pdi, puo) is det.

% opt(Opt, Yes, No): call Yes(Thing) if Opt is yes(Thing), and No if
% Opt is `no'.
:- pred opt(opt(T0), pred(T0, pstate(T1), pstate(T2)),
		parser(T1, T2), pstate(T1), pstate(T2)).
:- mode opt(in, in(pred(in, pdi, puo) is det), in(parser), pdi, puo) is det.

% upto(Rep, Fin): apply Rep repeatedly until Fin succeeds; the result
% pairs the list of Rep results (in order) with the result of Fin.
:- pred upto(parser(T1, T2), parser(T1, T3),
		pstate(T1), pstate((list(T2), T3))).
:- mode upto(in(parser), in(parser), pdi, puo) is det.

% range(F, L): read one token C and return it if F =< C =< L,
% otherwise fail.
:- pred range(unicode, unicode, pstate(_), pstate(unicode)).
:- mode range(in, in, pdi, puo) is det.

% (F - L): the same as range(F, L).
:- pred '-'(unicode, unicode, pstate(T1), pstate(unicode)).
:- mode '-'(in, in, pdi, puo) is det.

% wrap(P, Q): run P and return the result of applying Q to P's result.
:- pred wrap(parser(T1, T2), pred(T2, T3), pstate(T1), pstate(T3)).
:- mode wrap(in(parser), pred(in, out) is det, pdi, puo) is det.

% x(P): run P and discard its result, returning ''.
:- pred x(parser(T1, T2), pstate(T1), pstate('')).
:- mode x(in(parser), pdi, puo) is det.

% fst(P) / snd(P): run P and return the first / second component of
% the pair it produces.
:- pred fst(parser(S, (T,U)), pstate(S), pstate(T)).
:- mode fst(in(parser), pdi, puo) is det.

:- pred snd(parser(S, (T,U)), pstate(S), pstate(U)).
:- mode snd(in(parser), pdi, puo) is det.

% except(Exclusions): read one token and return it unless it is a
% member of Exclusions, in which case fail.
:- pred except(list(unicode), pstate(T1), pstate(unicode)).
:- mode except(in, pdi, puo) is det.

% no(P): run P, discard its result and return `no'.
:- pred no(parser(T1, T2), pstate(T1), pstate(opt(T3))).
:- mode no(in(parser), pdi, puo) is det.

% yes(P): run P and return its result X as yes(X).
:- pred yes(parser(T1, T2), pstate(T1), pstate(opt(T2))).
:- mode yes(in(parser), pdi, puo) is det.

% filter(P): run P, which returns a list of optional values, and keep
% only the X of each yes(X).
:- pred filter(parser(T1, list(opt(T2))), pstate(T1), pstate(list(T2))).
:- mode filter(in(parser), pdi, puo) is det.

% no(_, no) and yes(T, yes(T)): plain (non-parser) wrappers, in the
% shape expected by the second argument of wrap/4.
:- pred no(T1, opt(T2)).
:- mode no(in, out) is det.

:- pred yes(T, opt(T)).
:- mode yes(in, out) is det.

% list(P): run P and return its result as a one-element list.
:- pred list(parser(T1, T2), pstate(T1), pstate(list(T2))).
:- mode list(in(parser), pdi, puo) is det.

% get(Key, Val): look up the global stored under Key.
:- pred get(K, V, pstate(T3), pstate(T3)) <= global(K, V).
:- mode get(in, out, pdi, puo) is det.

% set(Key, Val): store Val under Key in the globals.
:- pred set(K, V, pstate(T3), pstate(T3)) <= global(K, V).
:- mode set(in, in, pdi, puo) is det.

:- implementation.

:- import_module char, int, list, string.

% The state threaded through every parser.
%	count		number of tokens read so far (incremented by tok/2,
%			wound back by reset/4)
%	entity		the text being read and the position within it
%	encoding	the encoding used by tok/2 and mkString/4
%	status		the outcome of the most recent parsing step
%	globals		the store behind get/4 and set/4
%	io		the io__state, borrowed from the caller of pstate/5
%			and returned by finish/3
:- type pstate(T)
	--->	s(
		    count	:: int,
		    entity	:: entity,
		    encoding	:: encoding,
		    status	:: status(T),
		    globals	:: globals,
		    io		:: io__state
		).

% The outcome of a parsing step.  try/6 treats the two non-ok cases
% differently: after `fail' it rewinds the position before calling its
% failure handler, after `error' it does not.
:- type status(T)
	--->	ok(T)
	;	fail(string)
	;	error(string)
	.

% An anonymous entity is simply a named entity whose name is `anon'.
mkEntity(Text) = mkEntity(anon, Text).

% Build an entity over Text, positioned at offset 0, with its length
% computed up front.
mkEntity(EntName, Text) = Ent :-
    string__length(Text, Len),
    Ent = entity(0, Len, Text, EntName).

% Pack a concrete encoding into the existentially typed wrapper.
mkEncoding(Enc) = Wrapped :-
    Wrapped = 'new enc'(Enc).

% The initial state: no tokens read yet, status ok('').
pstate(Entity, Enc, Globs, IO, s(0, Entity, Enc, ok(''), Globs, IO)).

% Translate the final status into a parse/1 result (fail and error both
% become error) and give the io__state back to the caller.
finish(Res, PS, IO) :-
    FinalStatus = PS^status,
    (
	FinalStatus = ok(Value),
	Res = ok(Value)
    ;
	FinalStatus = fail(Why),
	Res = error(Why)
    ;
	FinalStatus = error(Why),
	Res = error(Why)
    ),
    % The io__state was stored with a non-unique inst; u/1 promises that
    % this is the only reference left.
    IO = u(PS^io).

% Run P and report how it ended; a `fail' status is reported as an
% error, just like an `error' status.
parse(P, Res, PS0, PS) :-
    call(P, PS0, PS),
    Outcome = PS^status,
    (
	Outcome = ok(Value),
	Res = ok(Value)
    ;
	Outcome = fail(Why),
	Res = error(Why)
    ;
	Outcome = error(Why),
	Res = error(Why)
    ).

% Run Parser over Entity, temporarily replacing the entity that is
% currently being parsed.
%
% NOTE(review): the status left by Parser is not inspected.  If Parser
% stops before the end of Entity for any reason, including its own
% `fail' or `error', the status is overwritten with the generic error
% below and the saved entity E0 is not restored.  Confirm that this is
% intended.
parseEntity(Parser, Entity, PS0, PS) :-
    % Remember the entity we were in the middle of.
    E0 = PS0^entity,
    PS1 = PS0^entity := Entity,
    call(Parser, PS1, PS2),
    E1 = PS2^entity,
    (
        % The whole of Entity was consumed: carry on with the old one.
        E1^curr = E1^leng
    ->
    	PS = PS2^entity := E0
    ;
    	error("parse finished before the end of the entity", PS2, PS)
    ).

% actuate(P): run P only if the current status is `ok'.  A `fail' or
% `error' status is passed on unchanged (re-tagged with the new result
% type) without calling P.
:- pred actuate(parser(T1, T2), pstate(T1), pstate(T2)).
:- mode actuate(in(parser), pdi, puo) is det.

actuate(P, PS0, PS) :-
    Current = PS0^status,
    (
	Current = ok(_),
	call(P, PS0, PS)
    ;
	Current = fail(Why),
	fail(Why, PS0, PS)
    ;
	Current = error(Why),
	error(Why, PS0, PS)
    ).

% try(P, S, F, E): run P with the option of recovering from its failure.
% The statement order matters here: the status and position are captured
% before P runs and reinstated before the F or E handler is called.
try(P, S, F, E) -->
    % Remember where we are (token count and entity) and the status,
    % so that a failure of P can be undone.
    mark(M, Ent),
    status(Status0),
    actuate(P),
    status(Status),
    (
    	{ Status = ok(X) },
	call(S, X)
    ;
    	{ Status = fail(Msg) },
	% Undo P: old status, old position.
	setStatus(Status0),
	reset(M, Ent),
	call(F, Msg)
    ;
    	{ Status = error(Msg) },
	% Errors restore the status but do not rewind the input.
	setStatus(Status0),
	call(E, Msg)
    ).

% (P then T): sequencing.  Run P (only if the current status is ok) and
% pass its result to T; a `fail' or `error' status is carried over, only
% re-tagged with the result type of T.
then(P, T) -->
    actuate(P),
    status(Status1),
    (
    	{ Status1 = ok(X) },
	call(T, X)
    ;
    	{ Status1 = fail(Msg) },
	setStatus(fail(Msg))
    ;
    	{ Status1 = error(Msg) },
	setStatus(error(Msg))
    ).

% mark(Count, Entity): report the current token count and entity so
% that reset/4 can return to this point later.  The state is unchanged.
:- pred mark(int, entity, pstate(T), pstate(T)).
:- mode mark(out, out, pdi, puo) is det.

mark(Count, Entity, PS, PS) :-
    Count = PS^count,
    Entity = PS^entity.

% reset(Count, Entity): go back to a point previously reported by mark/4.
:- pred reset(int, entity, pstate(T), pstate(T)).
:- mode reset(in, in, pdi, puo) is det.

reset(SavedCount, SavedEntity, PS0, PS) :-
    WithCount = PS0^count := SavedCount,
    PS = WithCount^entity := SavedEntity.

% Read one unicode value from the current entity.
tok(PS0, PS) :-
    Entity0 = PS0^entity,
    % Unpack the existentially typed encoding so that the typeclass
    % method decode/4 can be called on it.
    enc(Enc) = PS0^encoding,
    ( decode(Enc, Uni, Entity0, Entity) ->
        PS1 = PS0^status := ok(Uni),
	PS2 = PS1^entity := Entity,
	% One more token read; star/4 compares counts to detect progress.
	PS = PS2^count := (PS2^count + 1)
    ;
        % Any decoding failure is reported as end of input.
        PS = PS0^status := fail("eof")
    ).

% Each of these only overwrites the status field; everything else in
% the state is left alone.

% Succeed with the result X.
return(X, PS, PS^status := ok(X)).

% Succeed with the uninteresting result ''.
return(PS0, PS) :-
    return('', PS0, PS).

% Record a recoverable failure.
fail(Msg, PS, PS^status := fail(Msg)).

% Record an error.
error(Msg, PS, PS^status := error(Msg)).

% Install Enc as the encoding used from now on.
setEncoding(Enc, PS, PS^encoding := Enc).

% Report the encoding currently in use; the state is unchanged.
getEncoding(Enc, PS, PS) :-
    Enc = PS^encoding.

% status(S): report the current status; the state is unchanged.
:- pred status(status(T), pstate(T), pstate(T)).
:- mode status(out, pdi, puo) is det.

status(Status, PS, PS) :-
    Status = PS^status.

% setStatus(S): overwrite the status, possibly changing the result type
% carried by the state.
:- pred setStatus(status(T1), pstate(T2), pstate(T1)).
:- mode setStatus(in, pdi, puo) is det.

setStatus(Status, PS0, PS) :-
    PS = PS0^status := Status.

% Read one token; succeed with R if the token is U.
lit1(Wanted, R) -->
    then(tok, (pred(Got::in, pdi, puo) is det -->
	( { Wanted = Got } ->
	    return(R)
	;
	    fail("character didn't match")
	)
    )).

% Matching a single character and returning that same character is the
% four-argument form with the character as its own result.
lit1(U) -->
    lit1(U, U).

% Matching a literal and returning the literal itself is the
% four-argument form with the string as its own result.
lit(Str) -->
    lit(Str, Str).

% Match the characters of Str in sequence; succeed with Thing.
lit(Str, Thing) -->
    { string__to_char_list(Str, Chars) },
    then(lit2(Chars), (pred(_::in, pdi, puo) is det -->
	return(Thing)
    )).

% lit2(Chars): read one token for each character in Chars, failing with
% "literal failed to match" at the first token whose value differs from
% the character's integer code.  The empty list matches without reading
% anything.
:- pred lit2(list(char), pstate(_), pstate('')).
:- mode lit2(in, pdi, puo) is det.

lit2([]) -->
    return('').
lit2([C|Is]) -->
    { char__to_int(C, I) },
    (tok			    then (pred(I0::in, pdi, puo) is det -->
    ( { I = I0 } ->
	lit2(Is)
    ;
    	fail("literal failed to match")
    ))).

% Read one token and return it if it is a quote character.
quote -->
    tok				    then (pred(Q::in, pdi, puo) is det -->
    ( {
        % ('''') and ('"') are the constants exported by unicode.m.
        Q = ('''')
    ;
        Q = ('"')
    } ->
    	return(Q)
    ;
    	fail("expected a quote")
    )).

% Run an I/O action that produces a result.  The io__state kept in the
% parser state is made unique with u/1, handed to Action, and the new
% io__state is stored back.
io(Action, Res, PS0, PS) :-
    IO0 = u(PS0^io),
    call(Action, Res, IO0, IO1),
    PS = PS0^io := IO1.

% Run an I/O action that produces no result.
io(Action, PS0, PS) :-
    IO0 = u(PS0^io),
    call(Action, IO0, IO1),
    PS = PS0^io := IO1.

% Encode UniCodes with the encoding currently stored in the state.
mkString(UniCodes, String, PS, PS) :-
    % Unpack the existential wrapper to reach the typeclass method.
    enc(Enc) = PS^encoding,
    encode(Enc, UniCodes, String).

% Sequence: run A, then B, and pair up their results.
(A and B) -->
    actuate(A)			    then (pred(X::in, pdi, puo) is det -->
    actuate(B)			    then (pred(Y::in, pdi, puo) is det -->
    return((X, Y))
    )).

% Alternative: if A fails, try/6 rewinds the input and B is run instead.
% A's result is returned as is, and an error from A is passed on as
% an error.
(A or B) -->
    try(A,
    	return,
	(pred(_::in, pdi, puo) is det --> call(B)),
	error).

% Zero or more repetitions of P.
star(P) -->
    star(P, []).

% star(P, Xs0): Xs0 holds the results collected so far, most recent
% first.
:- pred star(parser(T1, T2), list(T2), pstate(T1), pstate(list(T2))).
:- mode star(in(parser), in, pdi, puo) is det.

star(P, Xs0) -->
    status(Status0),
    mark(Start, _Ent),
    try(P,
    	(pred(X::in, pdi, puo) is det -->
	    mark(End, _EEnt),
	    % Only loop if P actually read something; a P that succeeds
	    % on empty input would otherwise repeat forever.
	    ( { Start \= End } ->
		% Put back the status (and its result type) that was
		% current before P ran, as the recursive call expects.
		setStatus(Status0),
		star(P, [X|Xs0])
	    ;
	    	fail("star(null)")
	    )
	),
	(pred(_::in, pdi, puo) is det -->
	    % P failed: the repetition is over.
	    { reverse(Xs0, Xs) },
	    return(Xs)
	),
	error
    ).

% One or more repetitions of P: one mandatory application, then star/4
% with that first result already collected.
plus(P) -->
    status(Status0),
    (actuate(P)			   then (pred(X::in, pdi, puo) is det -->
    setStatus(Status0),
    star(P, [X])
    )).

% opt(P, Def): the result of P, or Def if P fails.  Errors are passed on.
opt(P, Def) -->
    try(P,
    	return,
	(pred(_::in, pdi, puo) is det -->
	    return(Def)
	),
	error
    ).

% opt(Opt, Yes, No): dispatch on an already computed optional value.
opt(no, _Yes, No) -->
    call(No).
opt(yes(Thing), Yes, _No) -->
    call(Yes, Thing).

% opt(P): yes(X) if P succeeds with X, `no' if it fails.  Errors are
% passed on.
opt(P) -->
    try(P,
    	(pred(X::in, pdi, puo) is det -->
	    return(yes(X))
	),
    	(pred(_::in, pdi, puo) is det -->
	    return(no)
	),
	error
    ).

% upto(Rep, Fin): repeat Rep until Fin succeeds.
upto(Rep, Fin) -->
    upto(Rep, Fin, []).

% upto(Rep, Fin, Rs0): Rs0 holds the Rep results collected so far, most
% recent first.
:- pred upto(parser(T1, T2), parser(T1, T3), list(T2),
		pstate(T1), pstate((list(T2), T3))).
:- mode upto(in(parser), in(parser), in, pdi, puo) is det.

upto(Rep, Fin, Rs0) -->
    status(Status0),
    % Fin is tried first at every step, so it takes priority over Rep.
    try(Fin,
    	(pred(F::in, pdi, puo) is det -->
	    { reverse(Rs0, Rs) },
	    return((Rs, F))
	),
	(pred(_::in, pdi, puo) is det -->
	    setStatus(Status0),
	    (Rep		    then (pred(R::in, pdi, puo) is det -->
	    % Put back the status current on entry before recursing.
	    setStatus(Status0),
	    upto(Rep, Fin, [R|Rs0])
	))),
	error
    ).

% Read one token and return it if it lies in the inclusive range F..L.
range(F, L) -->
    tok				    then (pred(C::in, pdi, puo) is det -->
    ( { F =< C, C =< L } ->
    	return(C)
    ;
    	fail("not in range")
    )).

% Operator form of range/4.
(F - L) -->
    range(F, L).

% Run P and transform its result with Q.
wrap(P, Q) -->
    P				    then (pred(X::in, pdi, puo) is det -->
    { call(Q, X, W) },
    return(W)
    ).

% Run P and throw its result away.
x(P) -->
    P				    then (pred(_::in, pdi, puo) is det -->
    return('')
    ).

% Run P, which yields a pair, and keep the first component.
fst(P) -->
    P				    then (pred((T, _)::in, pdi, puo) is det -->
    return(T)
    ).

% Run P, which yields a pair, and keep the second component.
snd(P) -->
    P				    then (pred((_, T)::in, pdi, puo) is det -->
    return(T)
    ).

% Read one token and return it unless it appears in Exclusions.
except(Exclusions) -->
    tok				    then (pred(C::in, pdi, puo) is det -->
    ( { not member(C, Exclusions) } ->
        return(C)
    ;
    	fail("excluded character")
    )).

% Run Parser for its effect on the input and return `no'.
no(Parser) -->
    Parser			    then (pred(_::in, pdi, puo) is det -->
    return(no)
    ).

% Run Parser and wrap its result in yes/1.
yes(Parser) -->
    Parser			    then (pred(X::in, pdi, puo) is det -->
    return(yes(X))
    ).

% Run Parser, which yields a list of optional values, and drop the
% `no' entries (see filter1/2).
filter(Parser) -->
    Parser			    then (pred(Xs0::in, pdi, puo) is det -->
    { filter1(Xs0, Xs) },
    return(Xs)
    ).

% filter1(Opts, Xs): Xs contains the X of every yes(X) in Opts, in the
% same order; `no' elements are dropped.
%
% The determinism is stated once, on the mode declaration, as for every
% other predicate in this module.  A determinism on a `:- pred'
% declaration that gives no argument modes is not accepted by the
% compiler.
:- pred filter1(list(opt(T)), list(T)).
:- mode filter1(in, out) is det.

filter1([], []).
filter1([X0|Xs0], Xs) :-
    (
    	X0 = yes(X),
	filter1(Xs0, Xs1),
	Xs = [X|Xs1]
    ;
    	X0 = no,
	filter1(Xs0, Xs)
    ).

% Run P and return its result as a one-element list.
list(P) -->
    P				    then (pred(X::in, pdi, puo) is det -->
    return([X])
    ).

% Ignore the input and produce `no'.
no(_, no).

% Wrap the input in yes/1.
yes(T, yes(T)).

% Fetch the global stored under Key.  Keys and values are stored as
% univ, so the value is converted back to the expected type.
% NOTE(review): lookup and det_univ_to_type look like the aborting
% library variants, so a missing key or a value of the wrong type would
% abort rather than fail the parse; confirm against the library.
get(Key, Val, PS, PS) :-
	lookup(PS^globals, univ(Key), Val0),
	det_univ_to_type(Val0, Val).

% Store Val under Key, replacing any earlier value.
set(Key, Val, PS0, PS) :-
	set(PS0^globals, univ(Key), univ(Val), Globals),
	PS = PS0^globals := Globals.

% u(X): X with a unique inst.  Used to get the io__state back out of
% the parser state, where it is held with a non-unique inst.  This is an
% unchecked promise that no other reference to X will be used.
:- func u(T) = T.
:- mode (u(in) = uo) is det.

u(X) = Y :-
    unsafe_promise_unique(X, Y).

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module tryit.

:- interface.

:- import_module io.

:- pred main(io__state::di, io__state::uo) is cc_multi.

:- implementation.

:- import_module parsing, xml, xml:cat, xml:encoding, xml:parse.
:- import_module char, list, map, std_util, string.

% Parse every file named on the command line.
main -->
    io__command_line_arguments(Args),
    main(Args).

% main(Files): parse each file in turn as an XML document, writing any
% problem to stderr prefixed with the file name.
:- pred main(list(string), io__state, io__state).
:- mode main(in, di, uo) is det.

main([]) --> [].
main([File|Files]) -->
    see(File, Res0),
    (
	{ Res0 = ok },
	io__read_file_as_string(_, Text),
	% The whole file is in memory now, so close it straight away.
	% Without this every file named on the command line stays open
	% as an input stream until the program exits.  It has to happen
	% here because from pstate/5 onwards the threaded state is the
	% parser state, not the io__state.
	seen,
	pstate(mkEntity(Text), mkEncoding(utf8), init),
	% The directories to search come from $XML_DIRS, a colon
	% separated list that defaults to the current directory.
	io((pred(Dirs0::out, di, uo) is det -->
	    get_environment_var("XML_DIRS", MStr),
	    (
		{ MStr = no },
		{ Str = "." }
	    ;
		{ MStr = yes(Str) }
	    ),
	    { split((':'), Str, Dirs0) }
	), Dirs),
	set(gDirs, dirs(Dirs)),
	% A catalog that cannot be loaded is reported and replaced by an
	% empty one; it is not fatal.
	io((pred(Cat0::out, di, uo) is det -->
	    load("catalog", Dirs, Res1),
	    (
		{ Res1 = ok(Cat0) }
	    ;
		{ Res1 = error(Err0) },
		stderr_stream(StdErr0),
		format(StdErr0, "error reading catalog: %s\n", [s(Err0)]),
		{ init(Catalog0) },
		{ Cat0 = catalog(Catalog0) }
	    )
	), Cat),
	set(gCatalog, Cat),
	{ map__from_assoc_list([
	    "ASCII"	- mkEncoding(ascii7),
	    "ascii"	- mkEncoding(ascii7),
	    "Latin-1"	- mkEncoding(latin1),
	    "Latin1"	- mkEncoding(latin1),
	    "UTF-8"	- mkEncoding(utf8),
	    "utf-8"	- mkEncoding(utf8)
	], Encodings) },
	set(gEncodings, encodings(Encodings)),
	document,
	% finish/3 turns the parser state back into an io__state.
	finish(Res),
	(
	    { Res = ok(_) }
	;
	    { Res = error(Err) },
	    stderr_stream(StdErr),
	    format(StdErr, "%s: %s\n", [s(File), s(Err)])
	)
    ;
	% Report a file that cannot be opened instead of skipping it
	% silently, in the same format as a parse error.
	{ Res0 = error(OpenErr) },
	{ io__error_message(OpenErr, OpenMsg) },
	stderr_stream(StdErr1),
	format(StdErr1, "%s: %s\n", [s(File), s(OpenMsg)])
    ),
    main(Files).

% split(Sep, Str, Parts): Parts are the non-empty pieces of Str that lie
% between occurrences of Sep, in order.
:- pred split(char, string, list(string)).
:- mode split(in, in, out) is det.

split(Sep, Str, Parts) :-
    string__to_char_list(Str, Chars),
    % split1 accumulates the pieces most recent first.
    split1(Sep, [], RevParts, Chars, _Rest),
    list__reverse(RevParts, Parts).

% split1(C, Strs0, Strs): a DCG over a list of characters.  Repeatedly
% takes one piece (see split2) off the front of the input and, unless it
% is empty, pushes it onto the accumulator Strs0.  Strs holds the pieces
% most recent first.
:- pred split1(char, list(string), list(string), list(char), list(char)).
:- mode split1(in, in, out, in, out) is det.

split1(_C, Strs, Strs, [], []).
split1(C, Strs0, Strs) -->
	% Make the non-empty case explicit so that the two clauses form
	% a switch on the input.
	=([_|_]),
	split2(C, [], Cs0),
	{ reverse(Cs0, Cs) },
	( { Cs \= [] } ->
	    { string__from_char_list(Cs, Str) },
	    { Strs1 = [Str|Strs0] }
	;
	    % Empty pieces (adjacent or leading separators) are dropped.
	    { Strs1 = Strs0 }
	),
	split1(C, Strs1, Strs).

% split2(C, Cs0, Cs): consume characters up to and including the first
% C, or to the end of the input, pushing every character other than C
% onto the accumulator Cs0.  Cs holds them most recent first.
:- pred split2(char, list(char), list(char), list(char), list(char)).
:- mode split2(in, in, out, in, out) is det.

split2(_C, Cs, Cs, [], []).
split2(C, Cs0, Cs) -->
    [C0],
    ( { C = C0 } ->
    	{ Cs = Cs0 }
    ;
    	split2(C, [C0|Cs0], Cs)
    ).

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module unicode.

:- interface.

:- import_module list.

% A unicode value is represented as a plain integer.
:- type unicode == int.

% One constant function for each printable ASCII character from '!' to
% '~'.  Each returns the integer code of the character it is named
% after, so that parsers can write e.g. ('<') instead of 0x3C.
:- func ('!') = unicode.
:- func ('"') = unicode.
:- func ('#') = unicode.
:- func ('$') = unicode.
:- func ('%') = unicode.
:- func ('&') = unicode.
:- func ('''') = unicode.
:- func ('(') = unicode.
:- func (')') = unicode.
:- func ('*') = unicode.
:- func ('+') = unicode.
:- func (',') = unicode.
:- func ('-') = unicode.
:- func ('.') = unicode.
:- func ('/') = unicode.
:- func ('0') = unicode.
:- func ('1') = unicode.
:- func ('2') = unicode.
:- func ('3') = unicode.
:- func ('4') = unicode.
:- func ('5') = unicode.
:- func ('6') = unicode.
:- func ('7') = unicode.
:- func ('8') = unicode.
:- func ('9') = unicode.
:- func (':') = unicode.
:- func (';') = unicode.
:- func ('<') = unicode.
:- func ('=') = unicode.
:- func ('>') = unicode.
:- func ('?') = unicode.
:- func ('@') = unicode.
:- func ('A') = unicode.
:- func ('B') = unicode.
:- func ('C') = unicode.
:- func ('D') = unicode.
:- func ('E') = unicode.
:- func ('F') = unicode.
:- func ('G') = unicode.
:- func ('H') = unicode.
:- func ('I') = unicode.
:- func ('J') = unicode.
:- func ('K') = unicode.
:- func ('L') = unicode.
:- func ('M') = unicode.
:- func ('N') = unicode.
:- func ('O') = unicode.
:- func ('P') = unicode.
:- func ('Q') = unicode.
:- func ('R') = unicode.
:- func ('S') = unicode.
:- func ('T') = unicode.
:- func ('U') = unicode.
:- func ('V') = unicode.
:- func ('W') = unicode.
:- func ('X') = unicode.
:- func ('Y') = unicode.
:- func ('Z') = unicode.
:- func ('[') = unicode.
:- func ('\\') = unicode.
:- func (']') = unicode.
:- func ('^') = unicode.
:- func ('_') = unicode.
:- func ('`') = unicode.
:- func ('a') = unicode.
:- func ('b') = unicode.
:- func ('c') = unicode.
:- func ('d') = unicode.
:- func ('e') = unicode.
:- func ('f') = unicode.
:- func ('g') = unicode.
:- func ('h') = unicode.
:- func ('i') = unicode.
:- func ('j') = unicode.
:- func ('k') = unicode.
:- func ('l') = unicode.
:- func ('m') = unicode.
:- func ('n') = unicode.
:- func ('o') = unicode.
:- func ('p') = unicode.
:- func ('q') = unicode.
:- func ('r') = unicode.
:- func ('s') = unicode.
:- func ('t') = unicode.
:- func ('u') = unicode.
:- func ('v') = unicode.
:- func ('w') = unicode.
:- func ('x') = unicode.
:- func ('y') = unicode.
:- func ('z') = unicode.
:- func ('{') = unicode.
:- func ('|') = unicode.
:- func ('}') = unicode.
:- func ('~') = unicode.

:- implementation.

:- import_module char, require, string.

% The table of codes.  The values run consecutively from 0x21 ('!') to
% 0x7E ('~'), one entry per character.
('!') =	0x21.
('"') =	0x22.
('#') =	0x23.
('$') =	0x24.
('%') =	0x25.
('&') =	0x26.
('''') = 0x27.
('(') =	0x28.
(')') =	0x29.
('*') =	0x2A.
('+') =	0x2B.
(',') =	0x2C.
('-') =	0x2D.
('.') =	0x2E.
('/') =	0x2F.
('0') =	0x30.
('1') =	0x31.
('2') =	0x32.
('3') =	0x33.
('4') =	0x34.
('5') =	0x35.
('6') =	0x36.
('7') =	0x37.
('8') =	0x38.
('9') =	0x39.
(':') =	0x3A.
(';') =	0x3B.
('<') =	0x3C.
('=') =	0x3D.
('>') =	0x3E.
('?') =	0x3F.
('@') =	0x40.
('A') =	0x41.
('B') =	0x42.
('C') =	0x43.
('D') =	0x44.
('E') =	0x45.
('F') =	0x46.
('G') =	0x47.
('H') =	0x48.
('I') =	0x49.
('J') =	0x4A.
('K') =	0x4B.
('L') =	0x4C.
('M') =	0x4D.
('N') =	0x4E.
('O') =	0x4F.
('P') =	0x50.
('Q') =	0x51.
('R') =	0x52.
('S') =	0x53.
('T') =	0x54.
('U') =	0x55.
('V') =	0x56.
('W') =	0x57.
('X') =	0x58.
('Y') =	0x59.
('Z') =	0x5A.
('[') =	0x5B.
('\\') = 0x5C.
(']') =	0x5D.
('^') =	0x5E.
('_') =	0x5F.
('`') =	0x60.
('a') =	0x61.
('b') =	0x62.
('c') =	0x63.
('d') =	0x64.
('e') =	0x65.
('f') =	0x66.
('g') =	0x67.
('h') =	0x68.
('i') =	0x69.
('j') =	0x6A.
('k') =	0x6B.
('l') =	0x6C.
('m') =	0x6D.
('n') =	0x6E.
('o') =	0x6F.
('p') =	0x70.
('q') =	0x71.
('r') =	0x72.
('s') =	0x73.
('t') =	0x74.
('u') =	0x75.
('v') =	0x76.
('w') =	0x77.
('x') =	0x78.
('y') =	0x79.
('z') =	0x7A.
('{') =	0x7B.
('|') =	0x7C.
('}') =	0x7D.
('~') =	0x7E.

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:cat.

:- interface.

:- import_module xml:dtd.
:- import_module io.

% A catalog maps public identifiers to system identifiers.
% (`A -> B' is the map type alias defined in xml:dtd.)
:- type catalog	 ---> catalog(publicId -> systemId).

% The directories searched by find/5, in order.
:- type dirs	 == [path].

:- type publicId == string.

:- type systemId == string.

:- type path	 == string.

% The result of a catalog operation: a value or an error message.
:- type catRes(T)
	--->	ok(T)
	;	error(string)
	.

% load(Name, Dirs, Res): locate the file Name in Dirs and read it as a
% catalog.  Lines that cannot be parsed are reported on stderr and
% skipped; they do not make the load fail.
:- pred load(string, dirs, catRes(catalog), io__state, io__state).
:- mode load(in, in, out, di, uo) is det.

% find(Name, Dirs, Res): Res is ok(Path) for the first directory in Dirs
% in which Name can be opened, or an error if there is none.
:- pred find(string, dirs, catRes(string), io__state, io__state).
:- mode find(in, in, out, di, uo) is det.

:- implementation.

:- import_module char, int, list, map, string.

% The meaning of one line of a catalog file: a PUBLIC entry, or nothing
% (a line that is empty once whitespace is skipped).
:- type entry
	--->	dtd(publicId, systemId)
	;	none
	.

% load(Name, Dirs, Res): find the catalog file Name in Dirs, read it and
% build a catalog from its entries.
%
% find/5 leaves the file it located open as the current input stream,
% which is what read_file reads from.  The stream is closed with `seen'
% as soon as it has been read, whatever the outcome; otherwise it would
% stay open and remain the current input stream for the caller.
%
% Syntax errors in individual lines are written to stderr, prefixed with
% the path of the catalog, and do not prevent the remaining entries from
% being loaded.
load(Name, Dirs, Res) -->
    find(Name, Dirs, Res0),
    (
    	{ Res0 = ok(Path) },
	read_file(Res1),
	seen,
	(
	    { Res1 = ok(CatChars) },
	    % Number the lines from 1 so errors can refer to them, strip
	    % `#' comments, then parse each line separately.
	    { lines(1, CatLines0, CatChars, _) },
	    { decomment(CatLines0, CatLines) },
	    { parse(Entries, Errors, CatLines) },
	    { init(Cat0) },
	    { foldl(addEntry, Entries, catalog(Cat0), Cat) },
	    { Res = ok(Cat) },
	    foldl((pred(Msg::in, di, uo) is det -->
	        stderr_stream(StdErr),
		format(StdErr, "%s: %s\n", [s(Path), s(Msg)])
	    ), Errors)
	;
	    { Res1 = error(Err) },
	    { io__error_message(Err, Msg) },
	    { Res = error(Msg) }
	;
	    { Res1 = eof },
	    { Res = error("empty catalog file") }
	)
    ;
        { Res0 = error(Msg) },
	{ Res = error(Msg) }
    ).

% Try Dir/Name for each directory in turn, returning the first path that
% can be opened with see/4.
%
% NOTE: on success the opened file is left as the current input stream;
% load/5 depends on this when it calls read_file.
find(Name, [], error(Err)) -->
    { format("`%s' not found", [s(Name)], Err) }.
find(Name, [Dir|Dirs], Res) -->
    { append_list([Dir, "/", Name], Path) },
    see(Path, Res0),
    ( { Res0 = ok } ->
	{ Res = ok(Path) }
    ;
	find(Name, Dirs, Res)
    ).

% A pair, written with the comma functor; used for numbered lines.
:- type (A, B) ---> (A, B).

% lines(N, Lines): a DCG over a list of characters that splits all of
% the input into lines, numbering them from N.
:- pred lines(int, [(int, [char])], [char], [char]).
:- mode lines(in, out, in, out) is det.

lines(_N, [], [], []).
lines(N, [Line|Lines]) -->
    % Make the non-empty case explicit so that the two clauses form
    % a switch on the input.
    =([_|_]),
    line(N, Line),
    lines(N + 1, Lines).

% line(N, Line): take the characters up to the next newline (which is
% consumed but not kept) and pair them with the line number N.
:- pred line(int, (int, [char]), [char], [char]).
:- mode line(in, out, in, out) is det.

line(N, (N, Cs)) -->
    untilDiscard('\n', Cs).

% decomment(Lines0, Lines): cut each line at its first `#', keeping the
% line numbers.  The `#' is recognised wherever it occurs, including
% inside a quoted string.
:- pred decomment([(int, [char])], [(int, [char])]).
:- mode decomment(in, out) is det.

decomment(Lines0, Lines) :-
    map((pred(Line0::in, Line::out) is det :-
    	Line0 = (N, Cs0),
	Line = (N, Cs),
	untilDiscard('#', Cs, Cs0, _)
    ), Lines0, Lines).

% parse(Entries, Errors, Lines): parse each numbered line on its own.
% Lines that parse contribute an entry; for every other line a message
% of the form "<line number>: syntax error" is added to Errors.
:- pred parse([entry], [string], [(int, [char])]).
:- mode parse(out, out, in) is det.

parse([], [], []).
parse(Entries, Errors, [Line|Lines]) :-
    Line = (N, Cs),
    % Whatever follows a well-formed entry on the same line is ignored.
    ( parseEntry(Entry, Cs, _) ->
    	Entries = [Entry|Entries0],
	parse(Entries0, Errors, Lines)
    ;
        format("%d: syntax error", [i(N)], Msg),
	Errors = [Msg|Errors0],
	parse(Entries, Errors0, Lines)
    ).

% addEntry(Entry, Cat0, Cat): add a PUBLIC entry to the catalog; `none'
% entries leave it unchanged.
% NOTE(review): det_insert looks like the library insert that aborts if
% the key is already present, in which case a catalog listing the same
% public identifier twice would abort the program; confirm.
:- pred addEntry(entry, catalog, catalog).
:- mode addEntry(in, in, out) is det.

addEntry(none, Cat, Cat).
addEntry(dtd(PublicId, SystemId), catalog(Cat0), catalog(Cat)) :-
    det_insert(Cat0, PublicId, SystemId, Cat).

% parseEntry(Entry): parse one catalog line.  After optional leading
% whitespace the line is either
%	PUBLIC "public id" "system id"
% or empty; anything else makes the predicate fail.
:- pred parseEntry(entry, [char], [char]).
:- mode parseEntry(out, in, out) is semidet.

parseEntry(Entry) -->
    ws,
    ( ['P','U','B','L','I','C'], ws, string(PublicId), ws, string(SystemId) ->
    	{ Entry = dtd(PublicId, SystemId) }
    ; =([]) ->
    	% Nothing left on the line.
    	{ Entry = none }
    ;
    	{ fail }
    ).

% Skip zero or more whitespace characters.
:- pred ws([char], [char]).
:- mode ws(in, out) is det.

ws -->
    ( [C], { char__is_whitespace(C) } ->
    	ws
    ;
    	[]
    ).

% string(Str): a double quote followed by the characters of Str and a
% closing double quote.  Because untilDiscard/4 also stops at the end of
% the input, a string whose closing quote is missing is accepted too.
:- pred string(string, [char], [char]).
:- mode string(out, in, out) is semidet.

string(Str) -->
    ['"'], untilDiscard('"', Cs),
    { string__from_char_list(Cs, Str) }.

% untilDiscard(C, Cs): Cs is the input up to, but not including, the
% first occurrence of C.  That occurrence is consumed as well.  If C
% does not occur, Cs is all of the remaining input.
:- pred untilDiscard(char, [char], [char], [char]).
:- mode untilDiscard(in, out, in, out) is det.

untilDiscard(_C, [], [], []).
untilDiscard(C, Cs) -->
    % Make the non-empty case explicit so that the two clauses form
    % a switch on the input.
    =([_|_]),
    [C0],
    ( { C = C0 } ->
    	{ Cs = [] }
    ;
   	{ Cs = [C0|Cs0] },
	untilDiscard(C, Cs0)
    ).

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:doc.

:- interface.

:- import_module array, list, map.

% A parsed document.  The pieces of content are held in the `content'
% array and referred to by index from the other fields.
:- type document
	--->	doc(
		    prestuff	:: list(ref(content)),
		    root	:: ref(content),
		    poststuff	:: list(ref(content)),
		    content	:: array(content)
		).

% One piece of document content.
:- type content
	--->	element(element)
	;	pi(string, string)
	;	comment(string)
	;	data(string)
	.

% A store of content under construction.
%	eNext	the reference that the next call to add/4 will hand out
%	eMap	the content stored so far, keyed by reference
:- type contentStore
	--->	content(
			eNext	:: ref(content),
			eMap	:: map(ref(content), content)
		).

% An element: its name, its attributes and references to its children.
:- type element
	--->	element(
		    eName	:: string,
		    eAttrs	:: list(attribute),
		    eContent	:: list(ref(content))
		).

% An attribute: a name and a value.
:- type attribute
	--->	attribute(
		    aName	:: string,
		    aValue	:: string
		).

% A reference is an integer; the type parameter only documents what the
% reference is meant to refer to.
:- type ref(T)	== int.

% ref(Store, Ref): the content stored under Ref.
:- func ref(contentStore, ref(content)) = content.

% add(Content, Ref, Store0, Store): store Content under a fresh
% reference Ref.
:- pred add(content, ref(content), contentStore, contentStore).
:- mode add(in, out, in, out) is det.

:- implementation.

:- import_module int.

% Look up the content stored under Ref.
ref(Store, Ref) = Content :-
	Map = Store^eMap,
	map__lookup(Map, Ref, Content).

% Store Content under the next free reference and advance the counter.
add(Content, Ref, Store0, Store) :-
	Ref = Store0^eNext,
	map__set(Store0^eMap, Ref, Content, Map),
	Store1 = Store0^eMap := Map,
	Store = Store1^eNext := Ref + 1.

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:dtd.

:- interface.

:- import_module list, map.

% Notational conveniences used throughout the xml modules.

% `K -> V' is a map from K to V.
:- type (A -> B) == map(A, B).

:- type name == string.

:- type token == string.

% These two declarations allow list types to be written with list
% syntax, e.g. `[char]' for list(char).
:- type [] ---> [].

:- type [T1|T2] == list(T1).

% A document type definition: the name of the root element and three
% tables keyed by name.
:- type dtd
	--->	dtd(
		root		:: name,
		elements	:: (name -> element),
		entities	:: (name -> entityDef),
		pentities	:: (name -> entityDef)
	).

% The declaration of an element: its name, its attributes (keyed by
% attribute name) and its content model.
:- type element
	--->	element(
		    eName	:: name,
		    eAttrs	:: (name -> attribute),
		    eContent	:: content
		).

% The content model of an element.
:- type content
	--->	empty
	;	any
	;	children(contentParticle)
	;	mixed(mixed).

% A content particle pairs a group of children with a multiplicity.
:- type contentParticle
	--->	(children - multiplicity).

% A sequence of particles, a choice between particles, or a single
% named element.
:- type children
	--->	seq(list(contentParticle))
	;	alt(list(contentParticle))
	;	element(name)
	.

% The element names listed in a mixed content model.
:- type mixed
	--->	mixed(list(name)).

% How often a particle may occur.
% NOTE(review): presumably '1' means exactly once and the others are the
% usual XML occurrence indicators; confirm against the parser.
:- type multiplicity
	--->	('1')
	;	('*')
	;	('+')
	;	('?')
	.

% The declaration of an attribute: its name, type and default.
:- type attribute
	--->	attribute(
		    aName	:: name,
		    aType	:: (type),
		    aDefault	:: default
		).

% The type of an attribute.  The type name is written in parentheses
% because `type' is an operator.
:- type (type)
	--->	cdata
	;	id
	;	idref
	;	idrefs
	;	entity
	;	entities
	;	nmtoken
	;	nmtokens
	;	notation(list(token))
	;	enum(list(token))
	.

% The default of an attribute.
:- type default
	--->	required
	;	implied
	;	defaulted(string)
	;	fixed(string)
	.

% The definition of an entity: inline replacement text, or a reference
% to an external resource.
:- type entityDef
	--->	internal(entity)
	;	external(externalId)
	.

% Here an entity is just its text.  This is a different type from
% parsing:entity.
:- type entity == string.

% An external identifier: one string for `system', two for `public'.
:- type externalId
	--->	system(string)
	;	public(string, string)
	.

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:encoding.

:- interface.

:- import_module parsing.

% One single-valued type for each supported encoding, each an instance
% of the parsing:encoding/1 typeclass.  Pass the value to
% parsing:mkEncoding/1 to obtain an `encoding'.

:- type ascii7 ---> ascii7.

:- instance encoding(ascii7).

:- type latin1 ---> latin1.

:- instance encoding(latin1).

:- type utf8 ---> utf8.

:- instance encoding(utf8).

:- implementation.

:- import_module unicode.
:- import_module char, int, list, require, string.

:- instance encoding(ascii7) where [
	pred(decode/4) is decode_ascii7,
	pred(encode/3) is encode_ascii7
].

% Decoding takes the next value from the entity unchanged.
% NOTE(review): the value is not checked to be below 0x80, so this
% decoder behaves exactly like the Latin-1 one; confirm that this
% leniency is intended.
:- pred decode_ascii7(ascii7, unicode, entity, entity).
:- mode decode_ascii7(in, out, in, out) is semidet.

decode_ascii7(_, U) -->
    [U].

% Encoding aborts, via error/1, if any value cannot be represented
% (see unicodesToAscii7/3).
:- pred encode_ascii7(ascii7, list(unicode), string).
:- mode encode_ascii7(in, in, out) is det.

encode_ascii7(_, Us, Str) :-
    unicodesToAscii7(Us, Cs, []),
    string__from_char_list(Cs, Str).

% unicodesToAscii7(Us, Cs, Tail): Cs is the characters for Us followed
% by Tail.  Written as a DCG whose first list argument is the output.
% Only values strictly between 0x00 and 0x80 are accepted; anything else
% calls error/1.
:- pred unicodesToAscii7(list(unicode), list(char), list(char)).
:- mode unicodesToAscii7(in, out, in) is det.

unicodesToAscii7([]) --> [].
unicodesToAscii7([U|Us]) -->
    ( { U > 0x00, U < 0x80, char__to_int(C, U) } ->
    	[C],
	unicodesToAscii7(Us)
    ;
        { format("unicodesToAscii7: couldn't convert U-%x to 7bit ascii",
		[i(U)], Msg) },
	{ error(Msg) }
    ).

% Latin-1: the `encoding' methods are implemented by decode_latin1/4 and
% encode_latin1/3.
:- instance encoding(latin1) where [
	pred(decode/4) is decode_latin1,
	pred(encode/3) is encode_latin1
].

% decode_latin1(Encoding, Code, Entity0, Entity):
% Takes the next code from Entity0, returning it unchanged as Code and the
% advanced entity as Entity.  Fails when no code can be taken.
:- pred decode_latin1(latin1, unicode, entity, entity).
:- mode decode_latin1(in, out, in, out) is semidet.

decode_latin1(_Encoding, Code, Entity0, Entity) :-
    % This is the explicit form of the DCG terminal `[Code]'.
    Entity0 = [Code | Entity].

% encode_latin1(Encoding, Codes, Text):
% Text is the string whose characters are the characters for Codes, in
% order.  The conversion is done by unicodesToLatin1/3, which calls
% error/1 for any code it cannot convert.
:- pred encode_latin1(latin1, list(unicode), string).
:- mode encode_latin1(in, in, out) is det.

encode_latin1(_Encoding, Codes, Text) :-
    unicodesToLatin1(Codes, Chars, []),
    string__from_char_list(Chars, Text).

% unicodesToLatin1(Codes, Chars, Tail):
% Chars is the Latin-1 characters for Codes followed by Tail (a difference
% list).  Every code must satisfy 0 < Code =< 0xFF; otherwise error/1 is
% called with a message naming the offending code.
%
% The range is checked explicitly instead of relying on char__to_int/2 to
% fail: without the check, code 0 was accepted (unlike in the ASCII and
% UTF-8 encoders in this module) and whether values above 0xFF were
% rejected depended entirely on how wide the `char' type is.
:- pred unicodesToLatin1(list(unicode), list(char), list(char)).
:- mode unicodesToLatin1(in, out, in) is det.

unicodesToLatin1([]) --> [].
unicodesToLatin1([U|Us]) -->
    ( { U > 0x00, U =< 0xFF, char__to_int(C, U) } ->
        [C],
        unicodesToLatin1(Us)
    ;
        { format("unicodesToLatin1: couldn't convert U-%x to Latin-1",
            [i(U)], Msg) },
        { error(Msg) }
    ).

% UTF-8: the `encoding' methods are implemented by decode_utf8/4 and
% encode_utf8/3.
:- instance encoding(utf8) where [
	pred(decode/4) is decode_utf8,
	pred(encode/3) is encode_utf8
].

% decode_utf8(Encoding, Code, Entity0, Entity):
% Decodes one UTF-8 sequence (1 to 6 bytes) from the front of Entity0 into
% the code point Code; Entity is the entity advanced past the sequence.
%
% Fails if the entity is exhausted, if the first byte is not a valid lead
% byte (this includes a stray continuation byte 0x80-0xBF and the bytes
% 0xFE/0xFF), or if any following byte is not a continuation byte of the
% form 10xxxxxx.  Overlong encodings are not rejected.
:- pred decode_utf8(utf8, unicode, entity, entity).
:- mode decode_utf8(in, out, in, out) is semidet.

decode_utf8(_, U) -->
    [U0],
    ( { U0 /\ 0x80 = 0 } ->
        % 0xxxxxxx: a single byte.
        { U = U0 }
    ; { U0 /\ 0xE0 = 0xC0 } ->
        % 110xxxxx: one continuation byte.
        decode_utf8_cont(U1),
        { U = ((U0 /\ 0x1F) << 6) \/ U1 }
    ; { U0 /\ 0xF0 = 0xE0 } ->
        % 1110xxxx: two continuation bytes.
        decode_utf8_cont(U1), decode_utf8_cont(U2),
        { U = ((U0 /\ 0x0F) << 12) \/ (U1 << 6) \/ U2 }
    ; { U0 /\ 0xF8 = 0xF0 } ->
        % 11110xxx: three continuation bytes.
        decode_utf8_cont(U1), decode_utf8_cont(U2), decode_utf8_cont(U3),
        { U = ((U0 /\ 0x07) << 18) \/ (U1 << 12) \/
              (U2 << 6) \/ U3 }
    ; { U0 /\ 0xFC = 0xF8 } ->
        % 111110xx: four continuation bytes.
        decode_utf8_cont(U1), decode_utf8_cont(U2), decode_utf8_cont(U3),
        decode_utf8_cont(U4),
        { U = ((U0 /\ 0x03) << 24) \/ (U1 << 18) \/
              (U2 << 12) \/ (U3 << 6) \/ U4 }
    ; { U0 /\ 0xFE = 0xFC } ->
        % 1111110x: five continuation bytes.
        decode_utf8_cont(U1), decode_utf8_cont(U2), decode_utf8_cont(U3),
        decode_utf8_cont(U4), decode_utf8_cont(U5),
        { U = ((U0 /\ 0x01) << 30) \/ (U1 << 24) \/
              (U2 << 18) \/ (U3 << 12) \/
              (U4 << 6) \/ U5 }
    ;
        % Not a lead byte: fail instead of aborting.
        %{ error("decode_utf8: bad value!") }
        { fail }
    ).

% decode_utf8_cont(Bits, Entity0, Entity):
% Takes one byte from Entity0, which must have the form 10xxxxxx, and
% returns its six payload bits as Bits.  Fails otherwise.
:- pred decode_utf8_cont(unicode, entity, entity).
:- mode decode_utf8_cont(out, in, out) is semidet.

decode_utf8_cont(Bits) -->
    [Byte],
    { Byte /\ 0xC0 = 0x80 },
    { Bits = Byte /\ 0x3F }.

% encode_utf8(Encoding, Codes, Text):
% Text is the string made of the UTF-8 byte sequences for Codes, in order.
% The conversion is done by unicodesToUTF8/3, which calls error/1 for any
% code it cannot convert.
:- pred encode_utf8(utf8, list(unicode), string).
:- mode encode_utf8(in, in, out) is det.

encode_utf8(_Encoding, Codes, Text) :-
    unicodesToUTF8(Codes, Bytes, []),
    string__from_char_list(Bytes, Text).

% unicodesToUTF8(Codes, Chars, Tail):
% Chars is the UTF-8 byte sequences (one char per byte) for Codes followed
% by Tail (a difference list).  Codes from 0x01 to 0x7FFFFFFF are encoded
% in 1 to 6 bytes according to their magnitude; any other code makes the
% predicate call error/1 with a message naming it.
:- pred unicodesToUTF8(list(unicode), list(char), list(char)).
:- mode unicodesToUTF8(in, out, in) is det.

unicodesToUTF8([]) --> [].
unicodesToUTF8([Code | Codes]) -->
    (
        % 1 byte: 0xxxxxxx
        { 0x00 < Code, Code =< 0x7F,
          char__to_int(B0, Code) }
    ->
        [B0]
    ;
        % 2 bytes: 110xxxxx 10xxxxxx
        { 0x80 =< Code, Code =< 0x07FF,
          utf8Byte(0xC0, 0x1F, 6, Code, B0),
          utf8Byte(0x80, 0x3F, 0, Code, B1) }
    ->
        [B0, B1]
    ;
        % 3 bytes: 1110xxxx + 2 continuation bytes
        { 0x0800 =< Code, Code =< 0xFFFF,
          utf8Byte(0xE0, 0x0F, 12, Code, B0),
          utf8Byte(0x80, 0x3F, 6, Code, B1),
          utf8Byte(0x80, 0x3F, 0, Code, B2) }
    ->
        [B0, B1, B2]
    ;
        % 4 bytes: 11110xxx + 3 continuation bytes
        { 0x010000 =< Code, Code =< 0x1FFFFF,
          utf8Byte(0xF0, 0x07, 18, Code, B0),
          utf8Byte(0x80, 0x3F, 12, Code, B1),
          utf8Byte(0x80, 0x3F, 6, Code, B2),
          utf8Byte(0x80, 0x3F, 0, Code, B3) }
    ->
        [B0, B1, B2, B3]
    ;
        % 5 bytes: 111110xx + 4 continuation bytes
        { 0x200000 =< Code, Code =< 0x03FFFFFF,
          utf8Byte(0xF8, 0x03, 24, Code, B0),
          utf8Byte(0x80, 0x3F, 18, Code, B1),
          utf8Byte(0x80, 0x3F, 12, Code, B2),
          utf8Byte(0x80, 0x3F, 6, Code, B3),
          utf8Byte(0x80, 0x3F, 0, Code, B4) }
    ->
        [B0, B1, B2, B3, B4]
    ;
        % 6 bytes: 1111110x + 5 continuation bytes
        { 0x04000000 =< Code, Code =< 0x7FFFFFFF,
          utf8Byte(0xFC, 0x01, 30, Code, B0),
          utf8Byte(0x80, 0x3F, 24, Code, B1),
          utf8Byte(0x80, 0x3F, 18, Code, B2),
          utf8Byte(0x80, 0x3F, 12, Code, B3),
          utf8Byte(0x80, 0x3F, 6, Code, B4),
          utf8Byte(0x80, 0x3F, 0, Code, B5) }
    ->
        [B0, B1, B2, B3, B4, B5]
    ;
        { format("unicodesToUTF8: couldn't convert U-%x to UTF-8",
            [i(Code)], Msg) },
        { error(Msg) }
    ),
    unicodesToUTF8(Codes).

% utf8Byte(Marker, Mask, Shift, Code, Char):
% Char is the character whose integer value is
% Marker \/ (Mask /\ (Code >> Shift)).  Fails if char__to_int/2 cannot
% produce a character for that value.
:- pred utf8Byte(int, int, int, unicode, char).
:- mode utf8Byte(in, in, in, in, out) is semidet.

utf8Byte(Marker, Mask, Shift, Code, Char) :-
    char__to_int(Char, Marker \/ (Mask /\ (Code >> Shift))).

% Code.Rest = Entity:
% Used in its (out.out = in) mode only: takes an entity apart into the
% integer value Code of the character at its current position and the
% entity Rest whose current position is one further on.  Fails when the
% current position is not below the entity's length.
%
% NOTE(review): defining this on the list-cons functor is presumably what
% lets the decode_* predicates use a DCG terminal such as `[U]' with an
% entity as the DCG state -- confirm against the DCG expansion rules.
:- func unicode.entity = entity.
:- mode (out.out = in) is semidet.

Code.Rest = Entity :-
    Pos = Entity^curr,
    Pos < Entity^leng,
    % The bounds test above is the only check made before this unsafe index.
    string__unsafe_index(Entity^text, Pos, Char),
    char__to_int(Char, Code),
    Rest = Entity^curr := (Pos + 1).

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
% Top-level package module: it has no declarations of its own and only
% lists the sub-modules that make up the XML library.
:- module xml.

:- interface.

:- include_module xml:cat.
:- include_module xml:doc.
:- include_module xml:dtd.
:- include_module xml:encoding.
:- include_module xml:parse.

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:parse:chars.

:- interface.

:- import_module parsing, unicode.

% One recogniser per character class of Appendix B of the XML 1.0
% recommendation (productions [84]-[89], quoted in the implementation
% section).  Each takes a parser state with any result type and returns a
% parser state whose result type is `unicode'.
%
% The modes `pdi' and `puo' are defined in module `parsing' as synonyms
% for `in' and `out', so all six predicates have the same modes even
% though two of them spell them out directly.
%
% NOTE(review): all six are declared det, so a mismatch is presumably
% recorded inside the returned pstate instead of being signalled by
% failure -- confirm in parsing.m.

% Production [84] Letter.
:- pred letter(pstate(_), pstate(unicode)).
:- mode letter(in, out) is det.

% Production [85] BaseChar.
:- pred baseChar(pstate(T1), pstate(unicode)).
:- mode baseChar(pdi, puo) is det.

% Production [86] Ideographic.
:- pred ideographic(pstate(T1), pstate(unicode)).
:- mode ideographic(pdi, puo) is det.

% Production [87] CombiningChar.
:- pred combiningChar(pstate(T1), pstate(unicode)).
:- mode combiningChar(pdi, puo) is det.

% Production [88] Digit.
:- pred digit(pstate(_), pstate(unicode)).
:- mode digit(in, out) is det.

% Production [89] Extender.
:- pred extender(pstate(T1), pstate(unicode)).
:- mode extender(pdi, puo) is det.

:- implementation.

%B. Character Classes
%
%   Following the characteristics defined in the Unicode standard,
%   characters are classed as base characters (among others, these contain
%   the alphabetic characters of the Latin alphabet, without diacritics),
%   ideographic characters, and combining characters (among others, this
%   class contains most diacritics); these classes combine to form the
%   class of letters. Digits and extenders are also distinguished.
%   
%   Characters
%   [84]  Letter ::= BaseChar | Ideographic

% A letter is a base character or, failing that alternative, an
% ideographic character (production [84]).
letter -->
    or(baseChar, ideographic).

%   [85]  BaseChar ::= [#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6]
%   | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131]
%   | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E]
%   | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5]
%   | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386
%   | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE]
%   | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0
%   | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F]
%   | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4]
%   | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB]
%   | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559
%   | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2]
%   | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7]
%   | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5
%   | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961]
%   | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8]
%   | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD]
%   | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A]
%   | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30]
%   | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39]
%   | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B]
%   | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0]
%   | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0
%   | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28]
%   | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D
%   | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A]
%   | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C
%   | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA]
%   | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C]
%   | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33]
%   | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C]
%   | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3]
%   | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C]
%   | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39]
%   | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33]
%   | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88]
%   | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F]
%   | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB]
%   | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD
%   | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69]
%   | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103]
%   | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112]
%   | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150
%   | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165
%   | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175
%   | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8]
%   | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9
%   | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15]
%   | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D]
%   | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D]
%   | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4]
%   | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB]
%   | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126
%   | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094]
%   | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]

% baseChar: transcription of production [85] BaseChar.  The alternatives
% appear in the same order as in the production: a range [#xLo-#xHi] is
% written `(0xLo-0xHi)' and a single code point #xC is written `lit1(0xC)',
% and the alternatives are joined with `or'.
% NOTE(review): `-', `lit1' and `or' are parsing combinators that are not
% defined in this module (presumably they come from module `parsing').
baseChar -->
    (0x0041-0x005A) or (0x0061-0x007A) or (0x00C0-0x00D6)
    or (0x00D8-0x00F6) or (0x00F8-0x00FF) or (0x0100-0x0131)
    or (0x0134-0x013E) or (0x0141-0x0148) or (0x014A-0x017E)
    or (0x0180-0x01C3) or (0x01CD-0x01F0) or (0x01F4-0x01F5)
    or (0x01FA-0x0217) or (0x0250-0x02A8) or (0x02BB-0x02C1) or lit1(0x0386)
    or (0x0388-0x038A) or lit1(0x038C) or (0x038E-0x03A1) or (0x03A3-0x03CE)
    or (0x03D0-0x03D6) or lit1(0x03DA) or lit1(0x03DC) or lit1(0x03DE)
    or lit1(0x03E0)
    or (0x03E2-0x03F3) or (0x0401-0x040C) or (0x040E-0x044F)
    or (0x0451-0x045C) or (0x045E-0x0481) or (0x0490-0x04C4)
    or (0x04C7-0x04C8) or (0x04CB-0x04CC) or (0x04D0-0x04EB)
    or (0x04EE-0x04F5) or (0x04F8-0x04F9) or (0x0531-0x0556) or lit1(0x0559)
    or (0x0561-0x0586) or (0x05D0-0x05EA) or (0x05F0-0x05F2)
    or (0x0621-0x063A) or (0x0641-0x064A) or (0x0671-0x06B7)
    or (0x06BA-0x06BE) or (0x06C0-0x06CE) or (0x06D0-0x06D3) or lit1(0x06D5)
    or (0x06E5-0x06E6) or (0x0905-0x0939) or lit1(0x093D) or (0x0958-0x0961)
    or (0x0985-0x098C) or (0x098F-0x0990) or (0x0993-0x09A8)
    or (0x09AA-0x09B0) or lit1(0x09B2) or (0x09B6-0x09B9) or (0x09DC-0x09DD)
    or (0x09DF-0x09E1) or (0x09F0-0x09F1) or (0x0A05-0x0A0A)
    or (0x0A0F-0x0A10) or (0x0A13-0x0A28) or (0x0A2A-0x0A30)
    or (0x0A32-0x0A33) or (0x0A35-0x0A36) or (0x0A38-0x0A39)
    or (0x0A59-0x0A5C) or lit1(0x0A5E) or (0x0A72-0x0A74) or (0x0A85-0x0A8B)
    or lit1(0x0A8D) or (0x0A8F-0x0A91) or (0x0A93-0x0AA8) or (0x0AAA-0x0AB0)
    or (0x0AB2-0x0AB3) or (0x0AB5-0x0AB9) or lit1(0x0ABD) or lit1(0x0AE0)
    or (0x0B05-0x0B0C) or (0x0B0F-0x0B10) or (0x0B13-0x0B28)
    or (0x0B2A-0x0B30) or (0x0B32-0x0B33) or (0x0B36-0x0B39) or lit1(0x0B3D)
    or (0x0B5C-0x0B5D) or (0x0B5F-0x0B61) or (0x0B85-0x0B8A)
    or (0x0B8E-0x0B90) or (0x0B92-0x0B95) or (0x0B99-0x0B9A) or lit1(0x0B9C)
    or (0x0B9E-0x0B9F) or (0x0BA3-0x0BA4) or (0x0BA8-0x0BAA)
    or (0x0BAE-0x0BB5) or (0x0BB7-0x0BB9) or (0x0C05-0x0C0C)
    or (0x0C0E-0x0C10) or (0x0C12-0x0C28) or (0x0C2A-0x0C33)
    or (0x0C35-0x0C39) or (0x0C60-0x0C61) or (0x0C85-0x0C8C)
    or (0x0C8E-0x0C90) or (0x0C92-0x0CA8) or (0x0CAA-0x0CB3)
    or (0x0CB5-0x0CB9) or lit1(0x0CDE) or (0x0CE0-0x0CE1) or (0x0D05-0x0D0C)
    or (0x0D0E-0x0D10) or (0x0D12-0x0D28) or (0x0D2A-0x0D39)
    or (0x0D60-0x0D61) or (0x0E01-0x0E2E) or lit1(0x0E30) or (0x0E32-0x0E33)
    or (0x0E40-0x0E45) or (0x0E81-0x0E82) or lit1(0x0E84) or (0x0E87-0x0E88)
    or lit1(0x0E8A) or lit1(0x0E8D) or (0x0E94-0x0E97) or (0x0E99-0x0E9F)
    or (0x0EA1-0x0EA3) or lit1(0x0EA5) or lit1(0x0EA7) or (0x0EAA-0x0EAB)
    or (0x0EAD-0x0EAE) or lit1(0x0EB0) or (0x0EB2-0x0EB3) or lit1(0x0EBD)
    or (0x0EC0-0x0EC4) or (0x0F40-0x0F47) or (0x0F49-0x0F69)
    or (0x10A0-0x10C5) or (0x10D0-0x10F6) or lit1(0x1100) or (0x1102-0x1103)
    or (0x1105-0x1107) or lit1(0x1109) or (0x110B-0x110C) or (0x110E-0x1112)
    or lit1(0x113C) or lit1(0x113E) or lit1(0x1140) or lit1(0x114C)
    or lit1(0x114E) or lit1(0x1150)
    or (0x1154-0x1155) or lit1(0x1159) or (0x115F-0x1161) or lit1(0x1163)
    or lit1(0x1165)
    or lit1(0x1167) or lit1(0x1169) or (0x116D-0x116E) or (0x1172-0x1173)
    or lit1(0x1175)
    or lit1(0x119E) or lit1(0x11A8) or lit1(0x11AB) or (0x11AE-0x11AF)
    or (0x11B7-0x11B8)
    or lit1(0x11BA) or (0x11BC-0x11C2) or lit1(0x11EB) or lit1(0x11F0)
    or lit1(0x11F9)
    or (0x1E00-0x1E9B) or (0x1EA0-0x1EF9) or (0x1F00-0x1F15)
    or (0x1F18-0x1F1D) or (0x1F20-0x1F45) or (0x1F48-0x1F4D)
    or (0x1F50-0x1F57) or lit1(0x1F59) or lit1(0x1F5B) or lit1(0x1F5D)
    or (0x1F5F-0x1F7D)
    or (0x1F80-0x1FB4) or (0x1FB6-0x1FBC) or lit1(0x1FBE) or (0x1FC2-0x1FC4)
    or (0x1FC6-0x1FCC) or (0x1FD0-0x1FD3) or (0x1FD6-0x1FDB)
    or (0x1FE0-0x1FEC) or (0x1FF2-0x1FF4) or (0x1FF6-0x1FFC) or lit1(0x2126)
    or (0x212A-0x212B) or lit1(0x212E) or (0x2180-0x2182) or (0x3041-0x3094)
    or (0x30A1-0x30FA) or (0x3105-0x312C) or (0xAC00-0xD7A3).

%   [86]  Ideographic ::= [#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]

% Transcription of production [86]; the alternatives are tried in the
% order in which the production lists them.
ideographic -->
    (0x4E00-0x9FA5)
    or lit1(0x3007)
    or (0x3021-0x3029).

%   [87]  CombiningChar ::= [#x0300-#x0345] | [#x0360-#x0361]
%   | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9]
%   | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4
%   | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF]
%   | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED]
%   | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D
%   | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC
%   | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8]
%   | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C
%   | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48]
%   | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC
%   | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD]
%   | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48]
%   | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83]
%   | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7
%   | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48]
%   | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83]
%   | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD]
%   | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43]
%   | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31
%   | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9]
%   | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35
%   | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84]
%   | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD]
%   | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1
%   | [#x302A-#x302F] | #x3099 | #x309A

% combiningChar: transcription of production [87] CombiningChar.  The
% alternatives appear in the same order as in the production: a range
% [#xLo-#xHi] is written `(0xLo-0xHi)' and a single code point #xC is
% written `lit1(0xC)', and the alternatives are joined with `or'.
% NOTE(review): `-', `lit1' and `or' are parsing combinators that are not
% defined in this module (presumably they come from module `parsing').
combiningChar -->
    (0x0300-0x0345) or (0x0360-0x0361)
    or (0x0483-0x0486) or (0x0591-0x05A1) or (0x05A3-0x05B9)
    or (0x05BB-0x05BD) or lit1(0x05BF) or (0x05C1-0x05C2) or lit1(0x05C4)
    or (0x064B-0x0652) or lit1(0x0670) or (0x06D6-0x06DC) or (0x06DD-0x06DF)
    or (0x06E0-0x06E4) or (0x06E7-0x06E8) or (0x06EA-0x06ED)
    or (0x0901-0x0903) or lit1(0x093C) or (0x093E-0x094C) or lit1(0x094D)
    or (0x0951-0x0954) or (0x0962-0x0963) or (0x0981-0x0983) or lit1(0x09BC)
    or lit1(0x09BE) or lit1(0x09BF) or (0x09C0-0x09C4) or (0x09C7-0x09C8)
    or (0x09CB-0x09CD) or lit1(0x09D7) or (0x09E2-0x09E3) or lit1(0x0A02)
    or lit1(0x0A3C)
    or lit1(0x0A3E) or lit1(0x0A3F) or (0x0A40-0x0A42) or (0x0A47-0x0A48)
    or (0x0A4B-0x0A4D) or (0x0A70-0x0A71) or (0x0A81-0x0A83) or lit1(0x0ABC)
    or (0x0ABE-0x0AC5) or (0x0AC7-0x0AC9) or (0x0ACB-0x0ACD)
    or (0x0B01-0x0B03) or lit1(0x0B3C) or (0x0B3E-0x0B43) or (0x0B47-0x0B48)
    or (0x0B4B-0x0B4D) or (0x0B56-0x0B57) or (0x0B82-0x0B83)
    or (0x0BBE-0x0BC2) or (0x0BC6-0x0BC8) or (0x0BCA-0x0BCD) or lit1(0x0BD7)
    or (0x0C01-0x0C03) or (0x0C3E-0x0C44) or (0x0C46-0x0C48)
    or (0x0C4A-0x0C4D) or (0x0C55-0x0C56) or (0x0C82-0x0C83)
    or (0x0CBE-0x0CC4) or (0x0CC6-0x0CC8) or (0x0CCA-0x0CCD)
    or (0x0CD5-0x0CD6) or (0x0D02-0x0D03) or (0x0D3E-0x0D43)
    or (0x0D46-0x0D48) or (0x0D4A-0x0D4D) or lit1(0x0D57) or lit1(0x0E31)
    or (0x0E34-0x0E3A) or (0x0E47-0x0E4E) or lit1(0x0EB1) or (0x0EB4-0x0EB9)
    or (0x0EBB-0x0EBC) or (0x0EC8-0x0ECD) or (0x0F18-0x0F19) or lit1(0x0F35)
    or lit1(0x0F37) or lit1(0x0F39) or lit1(0x0F3E) or lit1(0x0F3F)
    or (0x0F71-0x0F84)
    or (0x0F86-0x0F8B) or (0x0F90-0x0F95) or lit1(0x0F97) or (0x0F99-0x0FAD)
    or (0x0FB1-0x0FB7) or lit1(0x0FB9) or (0x20D0-0x20DC) or lit1(0x20E1)
    or (0x302A-0x302F) or lit1(0x3099) or lit1(0x309A).

%   [88]  Digit ::= [#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9]
%   | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F]
%   | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF]
%   | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F]
%   | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]

% Transcription of production [88]: fifteen ranges of ten code points
% each (nine for the range starting at 0x0BE7), tried in the order in
% which the production lists them.
digit -->
    (0x0030-0x0039)
    or (0x0660-0x0669)
    or (0x06F0-0x06F9)
    or (0x0966-0x096F)
    or (0x09E6-0x09EF)
    or (0x0A66-0x0A6F)
    or (0x0AE6-0x0AEF)
    or (0x0B66-0x0B6F)
    or (0x0BE7-0x0BEF)
    or (0x0C66-0x0C6F)
    or (0x0CE6-0x0CEF)
    or (0x0D66-0x0D6F)
    or (0x0E50-0x0E59)
    or (0x0ED0-0x0ED9)
    or (0x0F20-0x0F29).

%   [89]  Extender ::= #x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46
%   | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E]
%   | [#x30FC-#x30FE]

% Transcription of production [89]: eight single code points followed by
% three short ranges, tried in the order in which the production lists
% them.
extender -->
    lit1(0x00B7)
    or lit1(0x02D0)
    or lit1(0x02D1)
    or lit1(0x0387)
    or lit1(0x0640)
    or lit1(0x0E46)
    or lit1(0x0EC6)
    or lit1(0x3005)
    or (0x3031-0x3035)
    or (0x309D-0x309E)
    or (0x30FC-0x30FE).

-------------- next part --------------
%---------------------------------------------------------------------------%
% Copyright (C) 2000 The University of Melbourne.
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%---------------------------------------------------------------------------%
%
% Main author: conway at cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml:parse.
%
%                                                        W3C REC-xml-19980210
%                                                                            
%                    Extensible Markup Language (XML) 1.0
%                                      
%                    W3C Recommendation 10-February-1998
%                                      
%   This version:
%          http://www.w3.org/TR/1998/REC-xml-19980210
%          http://www.w3.org/TR/1998/REC-xml-19980210.xml
%          http://www.w3.org/TR/1998/REC-xml-19980210.html
%          http://www.w3.org/TR/1998/REC-xml-19980210.pdf
%          http://www.w3.org/TR/1998/REC-xml-19980210.ps
%          
%   Latest version:
%          http://www.w3.org/TR/REC-xml
%          
%   Previous version:
%          http://www.w3.org/TR/PR-xml-971208
%          
%   Editors:
%          Tim Bray (Textuality and Netscape) <tbray at textuality.com>
%          Jean Paoli (Microsoft) <jeanpa at microsoft.com>
%          C. M. Sperberg-McQueen (University of Illinois at Chicago)
%          <cmsmcq at uic.edu>
%          
%Abstract
%
%   The Extensible Markup Language (XML) is a subset of SGML that is
%   completely described in this document. Its goal is to enable generic
%   SGML to be served, received, and processed on the Web in the way that
%   is now possible with HTML. XML has been designed for ease of
%   implementation and for interoperability with both SGML and HTML.
%   
%Status of this document
%
%   This document has been reviewed by W3C Members and other interested
%   parties and has been endorsed by the Director as a W3C Recommendation.
%   It is a stable document and may be used as reference material or cited
%   as a normative reference from another document. W3C's role in making
%   the Recommendation is to draw attention to the specification and to
%   promote its widespread deployment. This enhances the functionality and
%   interoperability of the Web.
%   
%   This document specifies a syntax created by subsetting an existing,
%   widely used international text processing standard (Standard
%   Generalized Markup Language, ISO 8879:1986(E) as amended and
%   corrected) for use on the World Wide Web. It is a product of the W3C
%   XML Activity, details of which can be found at http://www.w3.org/XML.
%   A list of current W3C Recommendations and other technical documents
%   can be found at http://www.w3.org/TR.
%   
%   This specification uses the term URI, which is defined by [Berners-Lee
%   et al.], a work in progress expected to update [IETF RFC1738] and
%   [IETF RFC1808].
%   
%   The list of known errors in this specification is available at
%   http://www.w3.org/XML/xml-19980210-errata.
%   
%   Please report errors in this document to xml-editor at w3.org.
%   
%                     Extensible Markup Language (XML) 1.0
%                                       
%Table of Contents
%
%   1. Introduction
%       1.1 Origin and Goals
%       1.2 Terminology
%   2. Documents
%       2.1 Well-Formed XML Documents
%       2.2 Characters
%       2.3 Common Syntactic Constructs
%       2.4 Character Data and Markup
%       2.5 Comments
%       2.6 Processing Instructions
%       2.7 CDATA Sections
%       2.8 Prolog and Document Type Declaration
%       2.9 Standalone Document Declaration
%       2.10 White Space Handling
%       2.11 End-of-Line Handling
%       2.12 Language Identification
%   3. Logical Structures
%       3.1 Start-Tags, End-Tags, and Empty-Element Tags
%       3.2 Element Type Declarations
%           3.2.1 Element Content
%           3.2.2 Mixed Content
%       3.3 Attribute-List Declarations
%           3.3.1 Attribute Types
%           3.3.2 Attribute Defaults
%           3.3.3 Attribute-Value Normalization
%       3.4 Conditional Sections
%   4. Physical Structures
%       4.1 Character and Entity References
%       4.2 Entity Declarations
%           4.2.1 Internal Entities
%           4.2.2 External Entities
%       4.3 Parsed Entities
%           4.3.1 The Text Declaration
%           4.3.2 Well-Formed Parsed Entities
%           4.3.3 Character Encoding in Entities
%       4.4 XML Processor Treatment of Entities and References
%           4.4.1 Not Recognized
%           4.4.2 Included
%           4.4.3 Included If Validating
%           4.4.4 Forbidden
%           4.4.5 Included in Literal
%           4.4.6 Notify
%           4.4.7 Bypassed
%           4.4.8 Included as PE
%       4.5 Construction of Internal Entity Replacement Text
%       4.6 Predefined Entities
%       4.7 Notation Declarations
%       4.8 Document Entity
%   5. Conformance
%       5.1 Validating and Non-Validating Processors
%       5.2 Using XML Processors
%   6. Notation
%   
%  Appendices
%  
%   A. References
%       A.1 Normative References
%       A.2 Other References
%   B. Character Classes
%   C. XML and SGML (Non-Normative)
%   D. Expansion of Entity and Character References (Non-Normative)
%   E. Deterministic Content Models (Non-Normative)
%   F. Autodetection of Character Encodings (Non-Normative)
%   G. W3C XML Working Group (Non-Normative)
%     _________________________________________________________________
%   
%1. Introduction
%
%   Extensible Markup Language, abbreviated XML, describes a class of data
%   objects called XML documents and partially describes the behavior of
%   computer programs which process them. XML is an application profile or
%   restricted form of SGML, the Standard Generalized Markup Language [ISO
%   8879]. By construction, XML documents are conforming SGML documents.
%   
%   XML documents are made up of storage units called entities, which
%   contain either parsed or unparsed data. Parsed data is made up of
%   characters, some of which form character data, and some of which form
%   markup. Markup encodes a description of the document's storage layout
%   and logical structure. XML provides a mechanism to impose constraints
%   on the storage layout and logical structure.
%   
%   A software module called an XML processor is used to read XML
%   documents and provide access to their content and structure. It is
%   assumed that an XML processor is doing its work on behalf of another
%   module, called the application. This specification describes the
%   required behavior of an XML processor in terms of how it must read XML
%   data and the information it must provide to the application.
%   
%  1.1 Origin and Goals
%  
%   XML was developed by an XML Working Group (originally known as the
%   SGML Editorial Review Board) formed under the auspices of the World
%   Wide Web Consortium (W3C) in 1996. It was chaired by Jon Bosak of Sun
%   Microsystems with the active participation of an XML Special Interest
%   Group (previously known as the SGML Working Group) also organized by
%   the W3C. The membership of the XML Working Group is given in an
%   appendix. Dan Connolly served as the WG's contact with the W3C.
%   
%   The design goals for XML are:
%    1. XML shall be straightforwardly usable over the Internet.
%    2. XML shall support a wide variety of applications.
%    3. XML shall be compatible with SGML.
%    4. It shall be easy to write programs which process XML documents.
%    5. The number of optional features in XML is to be kept to the
%       absolute minimum, ideally zero.
%    6. XML documents should be human-legible and reasonably clear.
%    7. The XML design should be prepared quickly.
%    8. The design of XML shall be formal and concise.
%    9. XML documents shall be easy to create.
%   10. Terseness in XML markup is of minimal importance.
%       
%   This specification, together with associated standards (Unicode and
%   ISO/IEC 10646 for characters, Internet RFC 1766 for language
%   identification tags, ISO 639 for language name codes, and ISO 3166 for
%   country name codes), provides all the information necessary to
%   understand XML Version 1.0 and construct computer programs to process
%   it.
%   
%   This version of the XML specification may be distributed freely, as
%   long as all text and legal notices remain intact.
%   
%  1.2 Terminology
%  
%   The terminology used to describe XML documents is defined in the body
%   of this specification. The terms defined in the following list are
%   used in building those definitions and in describing the actions of an
%   XML processor:
%   
%   may
%          Conforming documents and XML processors are permitted to but
%          need not behave as described.
%          
%   must
%          Conforming documents and XML processors are required to behave
%          as described; otherwise they are in error.
%          
%   error
%          A violation of the rules of this specification; results are
%          undefined. Conforming software may detect and report an error
%          and may recover from it.
%          
%   fatal error
%          An error which a conforming XML processor must detect and
%          report to the application. After encountering a fatal error,
%          the processor may continue processing the data to search for
%          further errors and may report such errors to the application.
%          In order to support correction of errors, the processor may
%          make unprocessed data from the document (with intermingled
%          character data and markup) available to the application. Once a
%          fatal error is detected, however, the processor must not
%          continue normal processing (i.e., it must not continue to pass
%          character data and information about the document's logical
%          structure to the application in the normal way).
%          
%   at user option
%          Conforming software may or must (depending on the modal verb in
%          the sentence) behave as described; if it does, it must provide
%          users a means to enable or disable the behavior described.
%          
%   validity constraint
%          A rule which applies to all valid XML documents. Violations of
%          validity constraints are errors; they must, at user option, be
%          reported by validating XML processors.
%          
%   well-formedness constraint
%          A rule which applies to all well-formed XML documents.
%          Violations of well-formedness constraints are fatal errors.
%          
%   match
%          (Of strings or names:) Two strings or names being compared must
%          be identical. Characters with multiple possible representations
%          in ISO/IEC 10646 (e.g. characters with both precomposed and
%          base+diacritic forms) match only if they have the same
%          representation in both strings. At user option, processors may
%          normalize such characters to some canonical form. No case
%          folding is performed. (Of strings and rules in the grammar:) A
%          string matches a grammatical production if it belongs to the
%          language generated by that production. (Of content and content
%          models:) An element matches its declaration when it conforms in
%          the fashion described in the constraint "Element Valid".
%          
%   for compatibility
%          A feature of XML included solely to ensure that XML remains
%          compatible with SGML.
%          
%   for interoperability
%          A non-binding recommendation included to increase the chances
%          that XML documents can be processed by the existing installed
%          base of SGML processors which predate the WebSGML Adaptations
%          Annex to ISO 8879.
%          
%2. Documents
%
%   A data object is an XML document if it is well-formed, as defined in
%   this specification. A well-formed XML document may in addition be
%   valid if it meets certain further constraints.
%   
%   Each XML document has both a logical and a physical structure.
%   Physically, the document is composed of units called entities. An
%   entity may refer to other entities to cause their inclusion in the
%   document. A document begins in a "root" or document entity. Logically,
%   the document is composed of declarations, elements, comments,
%   character references, and processing instructions, all of which are
%   indicated in the document by explicit markup. The logical and physical
%   structures must nest properly, as described in "4.3.2 Well-Formed
%   Parsed Entities".
%   
%  2.1 Well-Formed XML Documents
%  
%   A textual object is a well-formed XML document if:
%    1. Taken as a whole, it matches the production labeled document.
%    2. It meets all the well-formedness constraints given in this
%       specification.
%    3. Each of the parsed entities which is referenced directly or
%       indirectly within the document is well-formed.
%       
%   Document
%   [1]  document ::= prolog element Misc*

:- interface.

:- import_module parsing.
:- import_module xml:cat.
:- import_module xml:doc.
:- import_module xml:dtd.
:- import_module list.

    %
    % The following three globals should be set in the globals included
    % in the initial parsing state.
    %
    % Each global is declared as a single-constructor "key" type plus an
    % instance of the `global' typeclass that pairs the key type with the
    % type of the value stored under it.
    %

    % gCatalog: key under which a value of type `catalog' is stored.
:- type gCatalog	---> gCatalog.
:- instance global(gCatalog, catalog) where [].

    % gDirs: key for the `dirs' value, a wrapper around `cat:dirs'.
    % NOTE(review): the instance names `parse:dirs', presumably the `dirs'
    % wrapper declared on the line above -- confirm the module qualifier.
:- type gDirs		---> gDirs.
:- type dirs		---> dirs(cat:dirs).
:- instance global(gDirs, parse:dirs) where [].

    % gEncodings: key for the `encodings' value, which wraps a
    % `string -> encoding' table.  xmlDecl looks the declared encoding
    % name up in this table with `search'.
:- type gEncodings	---> gEncodings.
:- type encodings	---> encodings(string -> encoding).
:- instance global(gEncodings, encodings) where [].

    % document(PS0, PS):
    % Parser for production [1], document ::= prolog element Misc*.
    % The result carried by the output state is the pair (DTD, Document).
:- pred document(pstate(_), pstate((dtd, document))).
:- mode document(in, out) is det.

:- implementation.

:- import_module xml:cat, xml:encoding.
:- import_module unicode.
:- import_module array, char, int, io, list, map, require, std_util, string.

    % Keys for the globals that are private to this module.  `document'
    % (re)initialises all of them except gDTD before parsing the prolog,
    % and sets gDTD once the prolog has been parsed.

    % gContent: key for the contentStore.
:- type gContent	---> gContent.
:- instance global(gContent, contentStore) where [].

    % gElements: key for the table of element declarations, by name.
:- type gElements	---> gElements.
:- type elements	---> elements(name -> dtd:element).
:- instance global(gElements, elements) where [].

    % gAttributes: key for a two-level table, name -> name -> attribute.
    % NOTE(review): presumably element name, then attribute name --
    % confirm against the attribute-list declaration parser.
:- type gAttributes	---> gAttributes.
:- type attributes	---> attributes(name -> name -> dtd:attribute).
:- instance global(gAttributes, attributes) where [].

    % gEntities: key for the entity table; `document' seeds it with the
    % table returned by `entities'.
:- type gEntities	---> gEntities.
:- type entities	---> entities(name -> entityDef).
:- instance global(gEntities, entities) where [].

    % gPEntities: key for a second name -> entityDef table.
    % NOTE(review): presumably the parameter entities -- confirm.
:- type gPEntities	---> gPEntities.
:- type pentities	---> pentities(name -> entityDef).
:- instance global(gPEntities, pentities) where [].

    % gDTD: key for the dtd value produced by `prolog'.
:- type gDTD		---> gDTD.
:- instance global(gDTD, dtd) where [].

    % gExtEntities: key for a table from externalId to dtd:entity.
:- type gExtEntities	---> gExtEntities.
:- type extEntities	---> extEntities(externalId -> dtd:entity).
:- instance global(gExtEntities, extEntities) where [].

document -->
    { contentStore(Content0) },
    set(gContent, Content0),
    set(gExtEntities, extEntities(init)),
    set(gEntities, entities(entities)),
    set(gPEntities, pentities(init)),
    set(gElements, elements(init)),
    set(gAttributes, attributes(init)),
    (prolog		    then (pred((DTD, PreMisc)::in, pdi, puo) is det -->
    (
      set(gDTD, DTD),
      (element		    then (pred(Root::in, pdi, puo) is det -->
      star(misc)	    then (pred(PostMisc0::in, pdi, puo) is det -->
      get(gContent, Content),
      { filterOpt(PostMisc0, PostMisc) },
      { Doc = doc(PreMisc, Root, PostMisc, array(values(Content^eMap))) },
      return((DTD, Doc))
    )))))).

:- pred contentStore(contentStore::out) is det.
contentStore(content(0, Map)) :-
	init(Map).

:- pred same_type(T::unused, T::unused) is det.
same_type(_, _).

:- func entities = (name -> entityDef).
entities = Entities :-
    map__from_assoc_list([
    	"lt"	- internal("<"),
    	"gt"	- internal(">"),
    	"amp"	- internal("&"),
    	"quot"	- internal("'"),
    	"apos"	- internal(""")
    ], Entities).

:- pred initDTD(name, dtd).
:- mode initDTD(in, out) is det.

initDTD(Root, DTD) :-
    init(Elems),
    map__from_assoc_list([
    	"lt"	- internal("<"),
    	"gt"	- internal(">"),
    	"amp"	- internal("&"),
    	"quot"	- internal("'"),
    	"apos"	- internal(""")
    ], Entities),
    init(PEntities),
    DTD = dtd(Root, Elems, Entities, PEntities).

%   
%   Matching the document production implies that:
%    1. It contains one or more elements.
%    2. There is exactly one element, called the root, or document
%       element, no part of which appears in the content of any other
%       element. For all other elements, if the start-tag is in the
%       content of another element, the end-tag is in the content of the
%       same element. More simply stated, the elements, delimited by
%       start- and end-tags, nest properly within each other.
%       
%   As a consequence of this, for each non-root element C in the document,
%   there is one other element P in the document such that C is in the
%   content of P, but is not in the content of any other element that is
%   in the content of P. P is referred to as the parent of C, and C as a
%   child of P.
%   
%  2.2 Characters
%  
%   A parsed entity contains text, a sequence of characters, which may
%   represent markup or character data. A character is an atomic unit of
%   text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters
%   are tab, carriage return, line feed, and the legal graphic characters
%   of Unicode and ISO/IEC 10646. The use of "compatibility characters",
%   as defined in section 6.8 of [Unicode], is discouraged.
%   
%   Character Range
%   [2]  Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
%   | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
%   blocks, FFFE, and FFFF. */

:- pred char(pstate(_), pstate(unicode)).
:- mode char(in, out) is det.

char -->
    tok					then (pred(C::in, pdi, puo) is det -->
    ( {
    	C = 0x09
    ;
    	C = 0x0A
    ;
    	C = 0x0D
    ;
    	C >= 0x20, C =< 0xD7FF
    ;
    	C >= 0xE000, C =< 0xFFFD
    ;
    	C >= 0x10000, C =< 0x10FFFF
    } ->
    	return(C)
    ;
    	{ format("Unexpected character `%x'.", [i(C)], Msg) },
	error(Msg)
    )).

%   
%   The mechanism for encoding character code points into bit patterns may
%   vary from entity to entity. All XML processors must accept the UTF-8
%   and UTF-16 encodings of 10646; the mechanisms for signaling which of
%   the two is in use, or for bringing other encodings into play, are
%   discussed later, in "4.3.3 Character Encoding in Entities".
%   
%  2.3 Common Syntactic Constructs
%  
%   This section defines some symbols used widely in the grammar.
%   
%   S (white space) consists of one or more space (#x20) characters,
%   carriage returns, line feeds, or tabs.
%   
%   White Space
%   [3]  S ::= (#x20 | #x9 | #xD | #xA)+

:- pred s(pstate(_), pstate(list(unicode))).
:- mode s(in, out) is det.

s -->
    plus(s0).

:- pred s0(pstate(_), pstate(unicode)).
:- mode s0(in, out) is det.

s0 -->
    tok					then (pred(C::in, pdi, puo) is det -->
    ( {
    	C = 0x20
    ;
    	C = 0x09
    ;
    	C = 0x0D
    ;
    	C = 0x0A
    } ->
    	return(C)
    ;
    	fail("not whitespace")
    )).

%   
%   Characters are classified for convenience as letters, digits, or other
%   characters. Letters consist of an alphabetic or syllabic base
%   character possibly followed by one or more combining characters, or of
%   an ideographic character. Full definitions of the specific characters
%   in each class are given in "B. Character Classes".
%   
%   A Name is a token beginning with a letter or one of a few punctuation
%   characters, and continuing with letters, digits, hyphens, underscores,
%   colons, or full stops, together known as name characters. Names
%   beginning with the string "xml", or any string which would match
%   (('X'|'x') ('M'|'m') ('L'|'l')), are reserved for standardization in
%   this or future versions of this specification.
%   
%   Note: The colon character within XML names is reserved for
%   experimentation with name spaces. Its meaning is expected to be
%   standardized at some future point, at which point those documents
%   using the colon for experimental purposes may need to be updated.
%   (There is no guarantee that any name-space mechanism adopted for XML
%   will in fact use the colon as a name-space delimiter.) In practice,
%   this means that authors should not use the colon in XML names except
%   as part of name-space experiments, but that XML processors should
%   accept the colon as a name character.
%   
%   An Nmtoken (name token) is any mixture of name characters.
%   
%   Names and Tokens
%   [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
%   | CombiningChar | Extender

:- pred nameChar(pstate(_), pstate(unicode)).
:- mode nameChar(in, out) is det.

nameChar -->
    letter or digit or lit1('.') or lit1('-') or lit1('_') or lit1(':') or
    combiningChar or extender.

%   [5]  Name ::= (Letter | '_' | ':') (NameChar)*

:- pred name(pstate(_), pstate(name)).
:- mode name(in, out) is det.

name -->
    letter or lit1('_') or lit1(':')	then (pred(C::in, pdi, puo) is det -->
    star(nameChar)			then (pred(Cs::in, pdi, puo) is det -->
    mkString([C|Cs], Name),
    return(Name)
    )).

%   [6]  Names ::= Name (S Name)*

:- pred names(pstate(_), pstate(list(name))).
:- mode names(in, out) is det.

names -->
    name			    then (pred(Name::in, pdi, puo) is det -->
    star(snd(s and name))	    then (pred(Names0::in, pdi, puo) is det -->
    { Names = [Name|Names0] },
    return(Names)
    )).

%   [7]  Nmtoken ::= (NameChar)+

:- pred nmtoken(pstate(_), pstate(name)).
:- mode nmtoken(in, out) is det.

nmtoken -->
    plus(nameChar)		    then (pred(Cs::in, pdi, puo) is det -->
    mkString(Cs, Name),
    return(Name)
    ).

%   [8]  Nmtokens ::= Nmtoken (S Nmtoken)*

:- pred nmtokens(pstate(_), pstate(list(name))).
:- mode nmtokens(in, out) is det.

nmtokens -->
    nmtoken			    then (pred(Name::in, pdi, puo) is det -->
    star(snd(s and nmtoken))	    then (pred(Names::in, pdi, puo) is det -->
    return([Name|Names])
    )).

%   
%   Literal data is any quoted string not containing the quotation mark
%   used as a delimiter for that string. Literals are used for specifying
%   the content of internal entities (EntityValue), the values of
%   attributes (AttValue), and external identifiers (SystemLiteral). Note
%   that a SystemLiteral can be parsed without scanning for markup.
%   
%   Literals
%   [9]  EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
%   |  "'" ([^%&'] | PEReference | Reference)* "'"

:- pred entityValue(pstate(_), pstate(string)).
:- mode entityValue(in, out) is det.

entityValue -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    entityValue1(Q)		    then (pred(Chars::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
	mkString(Chars, Val),
	return(Val)
    ;
    	error("mismatched quotes")
    )))).

:- pred entityValue1(unicode, pstate(_), pstate(list(unicode))).
:- mode entityValue1(in, in, out) is det.

entityValue1(Q) -->
    star(list(charRef) or list(except([('%'), Q])) or 
	 pEReference(star(char)))
				    then (pred(Css::in, pdi, puo) is det -->
	{ condense(Css, Cs) },
	return(Cs)
    ).

%   [10]  AttValue ::= '"' ([^<&"] | Reference)* '"'
%   |  "'" ([^<&'] | Reference)* "'"

:- pred attValue(pstate(_), pstate(string)).
:- mode attValue(in, out) is det.

attValue -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    attValue1(Q)		    then (pred(Chars::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
	mkString(Chars, Val),
	return(Val)
    ;
    	error("mismatched quotes")
    )))).

:- pred attValue1(unicode, pstate(_), pstate(list(unicode))).
:- mode attValue1(in, in, out) is det.

attValue1(Q) -->
    star(list(charRef) or list(except([('&'), ('<'), Q])) or
         entityRef(star(char)))
    				    then (pred(Css::in, pdi, puo) is det -->
	{ condense(Css, Cs) },
	return(Cs)
    ).

:- pred attValue2(pstate(_), pstate(list(unicode))).
:- mode attValue2(in, out) is det.

attValue2 -->
    star(list(charRef) or entityRef(attValue2) or list(char))
    				    then (pred(Css::in, pdi, puo) is det -->
	{ condense(Css, Cs) },
	return(Cs)
    ).

%   [11]  SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")

:- pred systemLiteral(pstate(_), pstate(string)).
:- mode systemLiteral(in, out) is det.

systemLiteral -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    star(except([Q]))		    then (pred(Chars::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
	mkString(Chars, Val),
	return(Val)
    ;
    	error("mismatched quotes")
    )))).

%   [12]  PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"

:- pred pubidLiteral(pstate(_), pstate(string)).
:- mode pubidLiteral(in, out) is det.

pubidLiteral -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    star(pubidChar(Q))		    then (pred(Chars::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
	mkString(Chars, Val),
	return(Val)
    ;
    	error("mismatched quotes")
    )))).

%   [13]  PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
%   | [-'()+,./:=?;!*#@$_%]

:- pred pubidChar(unicode, pstate(_), pstate(unicode)).
:- mode pubidChar(in, in, out) is det.

pubidChar(Q) -->
    tok				    then (pred(C::in, pdi, puo) is det -->
    (
        { C \= Q },
        {
    	    C = 0x20
        ;
    	    C = 0x0D
        ;
    	    C = 0x0A
        ;
    	    C >= a, C =< z
        ;
    	    C >= 'A', C =< 'Z'
        ;
    	    C >= '0', C =< '9'
        ;
            char__to_int(Ch, C),
    	    contains_char("-'()+,./:=?;!*#@$_%\"", Ch)
        }
    ->
    	return(C)
    ;
    	fail("not a publicId char")
    )).

%   
%  2.4 Character Data and Markup
%  
%   Text consists of intermingled character data and markup. Markup takes
%   the form of start-tags, end-tags, empty-element tags, entity
%   references, character references, comments, CDATA section delimiters,
%   document type declarations, and processing instructions.
%   
%   All text that is not markup constitutes the character data of the
%   document.
%   
%   The ampersand character (&) and the left angle bracket (<) may appear
%   in their literal form only when used as markup delimiters, or within a
%   comment, a processing instruction, or a CDATA section. They are also
%   legal within the literal entity value of an internal entity
%   declaration; see "4.3.2 Well-Formed Parsed Entities". If they are
%   needed elsewhere, they must be escaped using either numeric character
%   references or the strings "&amp;" and "&lt;" respectively. The right
%   angle bracket (>) may be represented using the string "&gt;", and
%   must, for compatibility, be escaped using "&gt;" or a character
%   reference when it appears in the string "]]>" in content, when that
%   string is not marking the end of a CDATA section.
%   
%   In the content of elements, character data is any string of characters
%   which does not contain the start-delimiter of any markup. In a CDATA
%   section, character data is any string of characters not including the
%   CDATA-section-close delimiter, "]]>".
%   
%   To allow attribute values to contain both single and double quotes,
%   the apostrophe or single-quote character (') may be represented as
%   "&apos;", and the double-quote character (") as "&quot;".
%   
%   Character Data
%   [14]  CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)

:- pred charData(pstate(_), pstate(ref(doc:content))).
:- mode charData(in, out) is det.

charData -->
    plus(except([('<'), ('&')]) or charRef)
    				    then (pred(Chars::in, pdi, puo) is det -->
    mkString(Chars, Data),
    add(data(Data), Ref),
    return(Ref)
    ).

%   
%  2.5 Comments
%  
%   Comments may appear anywhere in a document outside other markup; in
%   addition, they may appear within the document type declaration at
%   places allowed by the grammar. They are not part of the document's
%   character data; an XML processor may, but need not, make it possible
%   for an application to retrieve the text of comments. For
%   compatibility, the string "--" (double-hyphen) must not occur within
%   comments.
%   
%   Comments
%   [15]  Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'

:- pred comment(pstate(_), pstate(ref(doc:content))).
:- mode comment(in, out) is det.

comment -->
    lit("<!--")			    then (pred(_::in, pdi, puo) is det -->
    upto(char, lit("-->"))	    then (pred((Cs, _)::in, pdi, puo) is det -->
    mkString(Cs, Comment),
    add(comment(Comment), Ref),
    return(Ref)
    )).

%   
%   An example of a comment:
%   
%   <!-- declarations for <head> & <body> -->
%   
%  2.6 Processing Instructions
%  
%   Processing instructions (PIs) allow documents to contain instructions
%   for applications.
%   
%   Processing Instructions
%   [16]  PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

:- pred pi(pstate(_), pstate(ref(doc:content))).
:- mode pi(in, out) is det.

pi -->
    lit("<?")			    then (pred(_::in, pdi, puo) is det -->
    piTarget			    then (pred(Target::in, pdi, puo) is det -->
    opt(s and upto(char, lit("?>")))
    				    then (pred(MD::in, pdi, puo) is det -->
    ( { MD = yes((_, (Chars, _))) } ->
	mkString(Chars, Data)
    ;
        { Data = "" }
    ),
    add(pi(Target, Data), Ref),
    return(Ref)
    ))).

%   [17]  PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))

:- pred piTarget(pstate(_), pstate(name)).
:- mode piTarget(in, out) is det.

piTarget -->
    name			    then (pred(Target::in, pdi, puo) is det -->
    ( { Target = "XML" ; Target = "xml" } ->
    	fail("(x|X)(m|M)(l|L) is not a valid pi target")
    ;
    	return(Target)
    )).

%   
%   PIs are not part of the document's character data, but must be passed
%   through to the application. The PI begins with a target (PITarget)
%   used to identify the application to which the instruction is directed.
%   The target names "XML", "xml", and so on are reserved for
%   standardization in this or future versions of this specification. The
%   XML Notation mechanism may be used for formal declaration of PI
%   targets.
%   
%  2.7 CDATA Sections
%  
%   CDATA sections may occur anywhere character data may occur; they are
%   used to escape blocks of text containing characters which would
%   otherwise be recognized as markup. CDATA sections begin with the
%   string "<![CDATA[" and end with the string "]]>":
%   
%   CDATA Sections
%   [18]  CDSect ::= CDStart CData CDEnd
%   [19]  CDStart ::= '<![CDATA['
%   [20]  CData ::= (Char* - (Char* ']]>' Char*))
%   [21]  CDEnd ::= ']]>'

:- pred cdSect(pstate(_), pstate(ref(doc:content))).
:- mode cdSect(in, out) is det.

cdSect -->
    lit("<![CDATA[")		    then (pred(_::in, pdi, puo) is det -->
    upto(char, lit("]]>", ''))	    then (pred((Cs, _)::in, pdi, puo) is det -->
    mkString(Cs, Data),
    add(data(Data), Ref),
    return(Ref)
    )).

%   
%   Within a CDATA section, only the CDEnd string is recognized as markup,
%   so that left angle brackets and ampersands may occur in their literal
%   form; they need not (and cannot) be escaped using "&lt;" and "&amp;".
%   CDATA sections cannot nest.
%   
%   An example of a CDATA section, in which "<greeting>" and "</greeting>"
%   are recognized as character data, not markup:
%   
%   <![CDATA[<greeting>Hello, world!</greeting>]]>
%   
%  2.8 Prolog and Document Type Declaration
%  
%   XML documents may, and should, begin with an XML declaration which
%   specifies the version of XML being used. For example, the following is
%   a complete XML document, well-formed but not valid:
%   
%   <?xml version="1.0"?>
%   <greeting>Hello, world!</greeting>
%   
%   and so is this:
%   
%   <greeting>Hello, world!</greeting>
%   
%   The version number "1.0" should be used to indicate conformance to
%   this version of this specification; it is an error for a document to
%   use the value "1.0" if it does not conform to this version of this
%   specification. It is the intent of the XML working group to give later
%   versions of this specification numbers other than "1.0", but this
%   intent does not indicate a commitment to produce any future versions
%   of XML, nor if any are produced, to use any particular numbering
%   scheme. Since future versions are not ruled out, this construct is
%   provided as a means to allow the possibility of automatic version
%   recognition, should it become necessary. Processors may signal an
%   error if they receive documents labeled with versions they do not
%   support.
%   
%   The function of the markup in an XML document is to describe its
%   storage and logical structure and to associate attribute-value pairs
%   with its logical structures. XML provides a mechanism, the document
%   type declaration, to define constraints on the logical structure and
%   to support the use of predefined storage units. An XML document is
%   valid if it has an associated document type declaration and if the
%   document complies with the constraints expressed in it.
%   
%   The document type declaration must appear before the first element in
%   the document.
%   
%   Prolog
%   [22]  prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?

:- pred prolog(pstate(_), pstate((dtd, list(ref(doc:content))))).
:- mode prolog(in, out) is det.

prolog -->
    opt(xmlDecl)		    then (pred(_::in, pdi, puo) is det -->
    star(misc)			    then (pred(Misc0::in, pdi, puo) is det -->
    opt(doctypedecl and star(misc)) then (pred(MStuff::in, pdi, puo) is det -->
    {
    	MStuff = yes((DTD, Misc1)),
	append(Misc0, Misc1, Misc2),
        filterOpt(Misc2, Misc)
    ;
    	MStuff = no,
	init(Elems),
	init(Entities),
	init(PEntities),
	DTD = dtd("", Elems, Entities, PEntities),
	filterOpt(Misc0, Misc)
    },
    return((DTD, Misc))
    ))).

%   [23]  XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'

:- pred xmlDecl(pstate(_), pstate('')).
:- mode xmlDecl(in, out) is det.

xmlDecl -->
    lit("<?xml")		    then (pred(_::in, pdi, puo) is det -->
    versionInfo			    then (pred(_::in, pdi, puo) is det -->
    opt(encodingDecl)		    then (pred(MEnc::in, pdi, puo) is det -->
    opt(sdDecl)			    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit("?>")			    then (pred(_::in, pdi, puo) is det -->
    (
        { MEnc = yes(EncName) },
	get(gEncodings, encodings(Encodings)),
	( { search(Encodings, EncName, Encoding) } ->
	    setEncoding(Encoding),
	    return
	;
	    { format("unknown encoding `%s'", [s(EncName)], Msg) },
	    error(Msg)
	)
    ;
        { MEnc = no },
	return
    ))))))).

%   [24]  VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ")

:- pred versionInfo(pstate(_), pstate('')).
:- mode versionInfo(in, out) is det.

versionInfo -->
    s				    then (pred(_::in, pdi, puo) is det -->
    lit("version")		    then (pred(_::in, pdi, puo) is det -->
    eq				    then (pred(_::in, pdi, puo) is det -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    versionNum			    then (pred(_::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
    	return
    ;
    	error("mismatched quotes")
    ))))))).

%   [25]  Eq ::= S? '=' S?

:- pred eq(pstate(_), pstate('')).
:- mode eq(in, out) is det.

eq -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit1('=')			    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    return
    ))).

%   [26]  VersionNum ::= ([a-zA-Z0-9_.:] | '-')+

:- pred versionNum(pstate(_), pstate(list(unicode))).
:- mode versionNum(in, out) is det.

versionNum -->
    plus(versionNumChar).

:- pred versionNumChar(pstate(_), pstate(unicode)).
:- mode versionNumChar(in, out) is det.

versionNumChar -->
    tok				    then (pred(C::in, pdi, puo) is det -->
    ( {
    	C >= a, C =< z
    ;
    	C >= 'A', C =< 'Z'
    ;
    	C >= '0', C =< '9'
    ;
    	char__to_int(Ch, C),
    	contains_char("_.:-", Ch)
    } ->
    	return(C)
    ;
    	fail("not a versionNum character")
    )).

%   [27]  Misc ::= Comment | PI |  S

:- pred misc(pstate(_), pstate(opt(ref(doc:content)))).
:- mode misc(in, out) is det.

misc -->
    opt(comment or pi)		    then (pred(MContent::in, pdi, puo) is det -->
    (
    	{ MContent = yes(_) },
	return(MContent)
    ;
    	{ MContent = no },
	no(s)
    )).

:- pred filterOpt(list(opt(T)), list(T)).
:- mode filterOpt(in, out) is det.

filterOpt([], []).
filterOpt([M0|Ms0], Ms) :-
    filterOpt(Ms0, Ms1),
    (
    	M0 = yes(M),
	Ms = [M|Ms1]
    ;
    	M0 = no,
    	Ms = Ms1
    ).

%   
%   The XML document type declaration contains or points to markup
%   declarations that provide a grammar for a class of documents. This
%   grammar is known as a document type definition, or DTD. The document
%   type declaration can point to an external subset (a special kind of
%   external entity) containing markup declarations, or can contain the
%   markup declarations directly in an internal subset, or can do both.
%   The DTD for a document consists of both subsets taken together.
%   
%   A markup declaration is an element type declaration, an attribute-list
%   declaration, an entity declaration, or a notation declaration. These
%   declarations may be contained in whole or in part within parameter
%   entities, as described in the well-formedness and validity constraints
%   below. For fuller information, see "4. Physical Structures".
%   
%   Document Type Definition
%   [28]  doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('['
%   (markupdecl | PEReference | S)* ']' S?)? '>' [ VC: Root Element Type ]

:- pred doctypedecl(pstate(_), pstate(dtd)).
:- mode doctypedecl(in, out) is det.

doctypedecl -->
    (lit("<!DOCTYPE")		    then (pred(_::in, pdi, puo) is det -->
    s				    then (pred(_::in, pdi, puo) is det -->
    name			    then (pred(Root::in, pdi, puo) is det -->
    opt(snd(s and externalID))	    then (pred(MExId::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    opt(lit1(('['), '') and
      parseInternalSubSet and
      (lit1((']'), '') and opt(s)))
				    then (pred(_::in, pdi, puo) is det -->
    lit1(('>'))			    then (pred(_::in, pdi, puo) is det -->
    opt(MExId, parseExtSubset, return)
    				    then (pred(_::in, pdi, puo) is det -->
    get(gEntities, entities(Entities)),
    get(gPEntities, pentities(PEntities)),
    get(gElements, elements(Elements)),
    return(dtd(Root, Elements, Entities, PEntities))
    ))))))))).

:- pred parseInternalSubSet(pstate(_), pstate('')).
:- mode parseInternalSubSet(pdi, puo) is det.

parseInternalSubSet -->
    star(no(markupdecl) or no(s) or no(pEReference(parseInternalSubSet)))
    				    then (pred(_::in, pdi, puo) is det -->
    return
    ).

:- pred parseExtSubset(externalId, pstate(_), pstate('')).
:- mode parseExtSubset(in, pdi, puo) is det.

parseExtSubset(ExId) -->
    getEntity(external(ExId))	    then (pred(Entity::in, pdi, puo) is det -->
    parseEntity(extSubset, mkEntity(Entity))
    				    then (pred(_::in, pdi, puo) is det -->
    return
    )).

%   [29]  markupdecl ::= elementdecl | AttlistDecl | EntityDecl
%   | NotationDecl | PI | Comment [ VC: Proper Declaration/PE Nesting ]
%   [ WFC: PEs in Internal Subset ]

:- pred markupdecl(pstate(_), pstate('')).
:- mode markupdecl(in, out) is det.

markupdecl -->
    no(elementdecl) or no(attlistDecl) or no(entityDecl) or
      no(pi) or no(comment)	    then (pred(_::in, pdi, puo) is det -->
    return
    ).

%   
%   The markup declarations may be made up in whole or in part of the
%   replacement text of parameter entities. The productions later in this
%   specification for individual nonterminals (elementdecl, AttlistDecl,
%   and so on) describe the declarations after all the parameter entities
%   have been included.
%   
%   Validity Constraint: Root Element Type
%   The Name in the document type declaration must match the element type
%   of the root element.
%   
%   Validity Constraint: Proper Declaration/PE Nesting
%   Parameter-entity replacement text must be properly nested with markup
%   declarations. That is to say, if either the first character or the
%   last character of a markup declaration (markupdecl above) is contained
%   in the replacement text for a parameter-entity reference, both must be
%   contained in the same replacement text.
%   
%   Well-Formedness Constraint: PEs in Internal Subset
%   In the internal DTD subset, parameter-entity references can occur only
%   where markup declarations can occur, not within markup declarations.
%   (This does not apply to references that occur in external parameter
%   entities or to the external subset.)
%   
%   Like the internal subset, the external subset and any external
%   parameter entities referred to in the DTD must consist of a series of
%   complete markup declarations of the types allowed by the non-terminal
%   symbol markupdecl, interspersed with white space or parameter-entity
%   references. However, portions of the contents of the external subset
%   or of external parameter entities may conditionally be ignored by
%   using the conditional section construct; this is not allowed in the
%   internal subset.
%   
%   External Subset
%   [30]  extSubset ::= TextDecl? extSubsetDecl

:- pred extSubset(pstate(T1), pstate('')).
:- mode extSubset(in, out) is det.

extSubset -->
    getEncoding(Enc),
    (opt(textDecl)		    then (pred(_::in, pdi, puo) is det -->
    extSubsetDecl		    then (pred(_::in, pdi, puo) is det -->
    return
    ))),
    setEncoding(Enc).

%   [31]  extSubsetDecl ::= ( markupdecl | conditionalSect | PEReference
%   | S )*

:- pred extSubsetDecl(pstate(_), pstate('')).
:- mode extSubsetDecl(in, out) is det.

extSubsetDecl -->
    star(no(markupdecl) or no(conditionalSect) or
         no(pEReference(extSubsetDecl)) or no(s))
	 			    then (pred(_::in, pdi, puo) is det -->
    return
    ).

%   
%   The external subset and external parameter entities also differ from
%   the internal subset in that in them, parameter-entity references are
%   permitted within markup declarations, not only between markup
%   declarations.
%   
%   An example of an XML document with a document type declaration:
%   
%   <?xml version="1.0"?>
%   <!DOCTYPE greeting SYSTEM "hello.dtd">
%   <greeting>Hello, world!</greeting>
%   
%   The system identifier "hello.dtd" gives the URI of a DTD for the
%   document.
%   
%   The declarations can also be given locally, as in this example:
%   
%   <?xml version="1.0" encoding="UTF-8" ?>
%   <!DOCTYPE greeting [
%     <!ELEMENT greeting (#PCDATA)>
%   ]>
%   <greeting>Hello, world!</greeting>
%   
%   If both the external and internal subsets are used, the internal
%   subset is considered to occur before the external subset. This has the
%   effect that entity and attribute-list declarations in the internal
%   subset take precedence over those in the external subset.
%   
%  2.9 Standalone Document Declaration
%  
%   Markup declarations can affect the content of the document, as passed
%   from an XML processor to an application; examples are attribute
%   defaults and entity declarations. The standalone document declaration,
%   which may appear as a component of the XML declaration, signals
%   whether or not there are such declarations which appear external to
%   the document entity.
%   
%   Standalone Document Declaration
%   [32]  SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"'
%   ('yes' | 'no') '"')) [ VC: Standalone Document Declaration ]

:- pred sdDecl(pstate(_), pstate('')).
:- mode sdDecl(in, out) is det.

sdDecl -->
    s				    then (pred(_::in, pdi, puo) is det -->
    lit("standalone")		    then (pred(_::in, pdi, puo) is det -->
    eq				    then (pred(_::in, pdi, puo) is det -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    (lit("yes") or lit("no"))	    then (pred(_::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
    	return
    ;
    	error("mismatched quotes")
    ))))))).

%   
%   In a standalone document declaration, the value "yes" indicates that
%   there are no markup declarations external to the document entity
%   (either in the DTD external subset, or in an external parameter entity
%   referenced from the internal subset) which affect the information
%   passed from the XML processor to the application. The value "no"
%   indicates that there are or may be such external markup declarations.
%   Note that the standalone document declaration only denotes the
%   presence of external declarations; the presence, in a document, of
%   references to external entities, when those entities are internally
%   declared, does not change its standalone status.
%   
%   If there are no external markup declarations, the standalone document
%   declaration has no meaning. If there are external markup declarations
%   but there is no standalone document declaration, the value "no" is
%   assumed.
%   
%   Any XML document for which standalone="no" holds can be converted
%   algorithmically to a standalone document, which may be desirable for
%   some network delivery applications.
%   
%   Validity Constraint: Standalone Document Declaration
%   The standalone document declaration must have the value "no" if any
%   external markup declarations contain declarations of:
%     * attributes with default values, if elements to which these
%       attributes apply appear in the document without specifications of
%       values for these attributes, or
%     * entities (other than amp, lt, gt, apos, quot), if references to
%       those entities appear in the document, or
%     * attributes with values subject to normalization, where the
%       attribute appears in the document with a value which will change
%       as a result of normalization, or
%     * element types with element content, if white space occurs directly
%       within any instance of those types.
%       
%   An example XML declaration with a standalone document declaration:
%   
%   <?xml version="1.0" standalone='yes'?>
%   
%  2.10 White Space Handling
%  
%   In editing XML documents, it is often convenient to use "white space"
%   (spaces, tabs, and blank lines, denoted by the nonterminal S in this
%   specification) to set apart the markup for greater readability. Such
%   white space is typically not intended for inclusion in the delivered
%   version of the document. On the other hand, "significant" white space
%   that should be preserved in the delivered version is common, for
%   example in poetry and source code.
%   
%   An XML processor must always pass all characters in a document that
%   are not markup through to the application. A validating XML processor
%   must also inform the application which of these characters constitute
%   white space appearing in element content.
%   
%   A special attribute named xml:space may be attached to an element to
%   signal an intention that in that element, white space should be
%   preserved by applications. In valid documents, this attribute, like
%   any other, must be declared if it is used. When declared, it must be
%   given as an enumerated type whose only possible values are "default"
%   and "preserve". For example:
%   
%       <!ATTLIST poem   xml:space (default|preserve) 'preserve'>
%   
%   The value "default" signals that applications' default white-space
%   processing modes are acceptable for this element; the value "preserve"
%   indicates the intent that applications preserve all the white space.
%   This declared intent is considered to apply to all elements within the
%   content of the element where it is specified, unless overridden with
%   another instance of the xml:space attribute.
%   
%   The root element of any document is considered to have signaled no
%   intentions as regards application space handling, unless it provides a
%   value for this attribute or the attribute is declared with a default
%   value.
%   
%  2.11 End-of-Line Handling
%  
%   XML parsed entities are often stored in computer files which, for
%   editing convenience, are organized into lines. These lines are
%   typically separated by some combination of the characters
%   carriage-return (#xD) and line-feed (#xA).
%   
%   To simplify the tasks of applications, wherever an external parsed
%   entity or the literal entity value of an internal parsed entity
%   contains either the literal two-character sequence "#xD#xA" or a
%   standalone literal #xD, an XML processor must pass to the application
%   the single character #xA. (This behavior can conveniently be produced
%   by normalizing all line breaks to #xA on input, before parsing.)
%   
%  2.12 Language Identification
%  
%   In document processing, it is often useful to identify the natural or
%   formal language in which the content is written. A special attribute
%   named xml:lang may be inserted in documents to specify the language
%   used in the contents and attribute values of any element in an XML
%   document. In valid documents, this attribute, like any other, must be
%   declared if it is used. The values of the attribute are language
%   identifiers as defined by [IETF RFC 1766], "Tags for the
%   Identification of Languages":
%   
%   Language Identification
%   [33]  LanguageID ::= Langcode ('-' Subcode)*

% languageID recognises production [33]: a Langcode followed by zero or
% more "-Subcode" segments.  The results of both sub-parsers are bound
% to `_', so nothing about the identifier is returned to the caller.
:- pred languageID(pstate(_), pstate('')).
:- mode languageID(in, out) is det.

languageID -->
	langcode		    then (pred(_::in, pdi, puo) is det -->
	% ('-' Subcode)* from the grammar.
	star(lit1('-') and subcode) then (pred(_::in, pdi, puo) is det -->
	return
	)).

%   [34]  Langcode ::= ISO639Code |  IanaCode |  UserCode

% langcode recognises production [34]: an ISO639Code, an IanaCode or a
% UserCode, tried in that order.  The matched value is ignored.
% NOTE(review): x/1 is defined outside this section; presumably it
% discards the wrapped parser's result so that the three alternatives
% have a common result type -- confirm against its definition.
:- pred langcode(pstate(_), pstate('')).
:- mode langcode(in, out) is det.

langcode -->
    (x(iso639Code) or x(ianaCode) or x(userCode))
    				    then (pred(_::in, pdi, puo) is det -->
    return
    ).

%   [35]  ISO639Code ::= ([a-z] | [A-Z]) ([a-z] | [A-Z])

% iso639Code recognises production [35]: exactly two ASCII letters, each
% either lower case or upper case.  The result is the pair of the two
% letters that were matched.
:- pred iso639Code(pstate(_), pstate((unicode, unicode))).
:- mode iso639Code(in, out) is det.

iso639Code -->
    (range(a, z) or range('A', 'Z')) and (range(a, z) or range('A', 'Z')).

%   [36]  IanaCode ::= ('i' | 'I') '-' ([a-z] | [A-Z])+

% ianaCode recognises production [36]: 'i' or 'I', then '-', then one or
% more ASCII letters.  The result is the tuple (prefix letter, the '-',
% list of letters).
:- pred ianaCode(pstate(_), pstate((unicode, unicode, list(unicode)))).
:- mode ianaCode(in, out) is det.

ianaCode -->
    (lit1('i') or lit1('I')) and
    lit1('-') and
    plus(range(a, z) or range('A', 'Z')).

%   [37]  UserCode ::= ('x' | 'X') '-' ([a-z] | [A-Z])+

% userCode recognises production [37]: 'x' or 'X', then '-', then one or
% more ASCII letters.  The result is the tuple (prefix letter, the '-',
% list of letters).
:- pred userCode(pstate(_), pstate((unicode, unicode, list(unicode)))).
:- mode userCode(in, out) is det.

userCode -->
    (lit1('x') or lit1('X')) and
    lit1('-') and
    plus(range(a, z) or range('A', 'Z')).

%   [38]  Subcode ::= ([a-z] | [A-Z])+

% subcode recognises production [38]: one or more ASCII letters (either
% case), returned as a list of the matched characters.
:- pred subcode(pstate(_), pstate(list(unicode))).
:- mode subcode(in, out) is det.

subcode -->
    plus(range(a, z) or range('A', 'Z')).

%   
%   The Langcode may be any of the following:
%     * a two-letter language code as defined by [ISO 639], "Codes for the
%       representation of names of languages"
%     * a language identifier registered with the Internet Assigned
%       Numbers Authority [IANA]; these begin with the prefix "i-" (or
%       "I-")
%     * a language identifier assigned by the user, or agreed on between
%       parties in private use; these must begin with the prefix "x-" or
%       "X-" in order to ensure that they do not conflict with names later
%       standardized or registered with IANA
%       
%   There may be any number of Subcode segments; if the first subcode
%   segment exists and the Subcode consists of two letters, then it must
%   be a country code from [ISO 3166], "Codes for the representation of
%   names of countries." If the first subcode consists of more than two
%   letters, it must be a subcode for the language in question registered
%   with IANA, unless the Langcode begins with the prefix "x-" or "X-".
%   
%   It is customary to give the language code in lower case, and the
%   country code (if any) in upper case. Note that these values, unlike
%   other names in XML documents, are case insensitive.
%   
%   For example:
%   
%   <p xml:lang="en">The quick brown fox jumps over the lazy dog.</p>
%   <p xml:lang="en-GB">What colour is it?</p>
%   <p xml:lang="en-US">What color is it?</p>
%   <sp who="Faust" desc='leise' xml:lang="de">
%     <l>Habe nun, ach! Philosophie,</l>
%     <l>Juristerei, und Medizin</l>
%     <l>und leider auch Theologie</l>
%     <l>durchaus studiert mit heißem Bemüh'n.</l>
%     </sp>
%   
%   The intent declared with xml:lang is considered to apply to all
%   attributes and content of the element where it is specified, unless
%   overridden with an instance of xml:lang on another element within that
%   content.
%   
%   A simple declaration for xml:lang might take the form
%   
%   xml:lang  NMTOKEN  #IMPLIED
%   
%   but specific default values may also be given, if appropriate. In a
%   collection of French poems for English students, with glosses and
%   notes in English, the xml:lang attribute might be declared this way:
%   
%       <!ATTLIST poem   xml:lang NMTOKEN 'fr'>
%       <!ATTLIST gloss  xml:lang NMTOKEN 'en'>
%       <!ATTLIST note   xml:lang NMTOKEN 'en'>
%   
%3. Logical Structures
%
%   Each XML document contains one or more elements, the boundaries of
%   which are either delimited by start-tags and end-tags, or, for empty
%   elements, by an empty-element tag. Each element has a type, identified
%   by name, sometimes called its "generic identifier" (GI), and may have
%   a set of attribute specifications. Each attribute specification has a
%   name and a value.
%   
%   Element
%   [39]  element ::= EmptyElemTag
%   | STag content ETag [ WFC: Element Type Match ]
%   [ VC: Element Valid ]

% element recognises production [39].  The empty-element form
% (emptyElemTag) is tried first; nonEmptyElement, which handles
% STag content ETag, is the second alternative.  Either way the result
% is a ref(doc:content).
:- pred element(pstate(_), pstate(ref(doc:content))).
:- mode element(in, out) is det.

element -->
    emptyElemTag or nonEmptyElement.

% nonEmptyElement handles the `STag content ETag' alternative of
% production [39].  The name delivered by the end-tag must unify with
% the name delivered by the start-tag (WFC: Element Type Match); when it
% does, the element is registered through add/2 and the resulting
% reference is returned.  Otherwise error/1 is invoked with a message
% naming both tags.
:- pred nonEmptyElement(pstate(_), pstate(ref(doc:content))).
:- mode nonEmptyElement(in, out) is det.

nonEmptyElement -->
    sTag then (pred((StartName, AttrList)::in, pdi, puo) is det -->
    content then (pred(Kids::in, pdi, puo) is det -->
    eTag then (pred(EndName::in, pdi, puo) is det -->
    ( { StartName = EndName } ->
        add(element(element(StartName, AttrList, Kids)), NodeRef),
        return(NodeRef)
    ;
        { format("start tag name `%s' and end tag name `%s' do not match",
            [s(StartName), s(EndName)], Msg) },
        error(Msg)
    )
    ))).

%   
%   This specification does not constrain the semantics, use, or (beyond
%   syntax) names of the element types and attributes, except that names
%   beginning with a match to (('X'|'x')('M'|'m')('L'|'l')) are reserved
%   for standardization in this or future versions of this specification.
%   
%   Well-Formedness Constraint: Element Type Match
%   The Name in an element's end-tag must match the element type in the
%   start-tag.
%   
%   Validity Constraint: Element Valid
%   An element is valid if there is a declaration matching elementdecl
%   where the Name matches the element type, and one of the following
%   holds:
%    1. The declaration matches EMPTY and the element has no content.
%    2. The declaration matches children and the sequence of child
%       elements belongs to the language generated by the regular
%       expression in the content model, with optional white space
%       (characters matching the nonterminal S) between each pair of child
%       elements.
%    3. The declaration matches Mixed and the content consists of
%       character data and child elements whose types match names in the
%       content model.
%    4. The declaration matches ANY, and the types of any child elements
%       have been declared.
%       
%  3.1 Start-Tags, End-Tags, and Empty-Element Tags
%  
%   The beginning of every non-empty XML element is marked by a start-tag.
%   
%   Start-tag
%   [40]  STag ::= '<' Name (S Attribute)* S? '>' [ WFC: Unique Att Spec ]

% sTag recognises a start-tag, production [40]:
%   '<' Name (S Attribute)* S? '>'
% and returns the pair (element name, attribute list).
% This predicate does not itself check the "Unique Att Spec"
% well-formedness constraint.
:- pred sTag(pstate(_), pstate((name, list(doc:attribute)))).
:- mode sTag(in, out) is det.

sTag -->
    lit1('<') then (pred(_::in, pdi, puo) is det -->
    name then (pred(TagName::in, pdi, puo) is det -->
    star(snd(s and attribute)) then (pred(AttrList::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1('>') then (pred(_::in, pdi, puo) is det -->
    return((TagName, AttrList))
    ))))).

%   [41]  Attribute ::= Name Eq AttValue [ VC: Attribute Value Type ]
%   [ WFC: No External Entity References ]
%   [ WFC: No < in Attribute Values ]

% attribute recognises one attribute specification, production [41]:
%   Name Eq AttValue
% and returns it as attribute(Name, Value).
:- pred attribute(pstate(_), pstate(doc:attribute)).
:- mode attribute(in, out) is det.

attribute -->
    name then (pred(AttrName::in, pdi, puo) is det -->
    eq then (pred(_::in, pdi, puo) is det -->
    attValue then (pred(AttrValue::in, pdi, puo) is det -->
    return(attribute(AttrName, AttrValue))
    ))).

%   
%   The Name in the start- and end-tags gives the element's type. The
%   Name-AttValue pairs are referred to as the attribute specifications
%   of the element, with the Name in each pair referred to as the
%   attribute name and the content of the AttValue (the text between the
%   ' or " delimiters) as the attribute value.
%   
%   Well-Formedness Constraint: Unique Att Spec
%   No attribute name may appear more than once in the same start-tag or
%   empty-element tag.
%   
%   Validity Constraint: Attribute Value Type
%   The attribute must have been declared; the value must be of the type
%   declared for it. (For attribute types, see "3.3 Attribute-List
%   Declarations".)
%   
%   Well-Formedness Constraint: No External Entity References
%   Attribute values cannot contain direct or indirect entity references
%   to external entities.
%   
%   Well-Formedness Constraint: No < in Attribute Values
%   The replacement text of any entity referred to directly or indirectly
%   in an attribute value (other than "&lt;") must not contain a <.
%   
%   An example of a start-tag:
%   
%   <termdef id="dt-dog" term="dog">
%   
%   The end of every element that begins with a start-tag must be marked
%   by an end-tag containing a name that echoes the element's type as
%   given in the start-tag:
%   
%   End-tag
%   [42]  ETag ::= '</' Name S? '>'

% eTag recognises an end-tag, production [42]:
%   '</' Name S? '>'
% and returns the element name it carries.
:- pred eTag(pstate(_), pstate(name)).
:- mode eTag(in, out) is det.

eTag -->
    lit("</") then (pred(_::in, pdi, puo) is det -->
    name then (pred(TagName::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1('>') then (pred(_::in, pdi, puo) is det -->
    return(TagName)
    )))).

%   
%   An example of an end-tag:
%   
%   </termdef>
%   
%   The text between the start-tag and end-tag is called the element's
%   content:
%   
%   Content of Elements
%   [43]  content ::= (element | CharData | Reference | CDSect | PI
%   | Comment)*

% content recognises element content, production [43].  Each repetition
% yields a list of content references (the single-item parsers are
% wrapped with list/1; entityRef(content) is used as-is), and the lists
% collected by star/1 are flattened with condense/2 before being
% returned.
:- pred content(pstate(_), pstate(list(ref(doc:content)))).
:- mode content(in, out) is det.

content -->
    star(list(element) or list(charData) or
         list(cdSect) or list(pi) or
         list(comment) or entityRef(content))
    then (pred(Groups::in, pdi, puo) is det -->
        { condense(Groups, Flat) },
        return(Flat)
    ).

%   
%   If an element is empty, it must be represented either by a start-tag
%   immediately followed by an end-tag or by an empty-element tag. An
%   empty-element tag takes a special form:
%   
%   Tags for Empty Elements
%   [44]  EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [ WFC: Unique
%   Att Spec ]

% emptyElemTag recognises an empty-element tag, production [44]:
%   '<' Name (S Attribute)* S? '/>'
% The element is built with an empty content list, registered through
% add/2, and the resulting reference is returned.
:- pred emptyElemTag(pstate(_), pstate(ref(doc:content))).
:- mode emptyElemTag(in, out) is det.

emptyElemTag -->
    lit1('<') then (pred(_::in, pdi, puo) is det -->
    name then (pred(TagName::in, pdi, puo) is det -->
    star(snd(s and attribute)) then (pred(AttrList::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit("/>") then (pred(_::in, pdi, puo) is det -->
    add(element(element(TagName, AttrList, [])), NodeRef),
    return(NodeRef)
    ))))).

%   
%   Empty-element tags may be used for any element which has no content,
%   whether or not it is declared using the keyword EMPTY. For
%   interoperability, the empty-element tag must be used, and can only be
%   used, for elements which are declared EMPTY.
%   
%   Examples of empty elements:
%   
%   <IMG align="left"
%    src="http://www.w3.org/Icons/WWW/w3c_home" />
%   <br></br>
%   <br/>
%   
%  3.2 Element Type Declarations
%  
%   The element structure of an XML document may, for validation purposes,
%   be constrained using element type and attribute-list declarations. An
%   element type declaration constrains the element's content.
%   
%   Element type declarations often constrain which element types can
%   appear as children of the element. At user option, an XML processor
%   may issue a warning when a declaration mentions an element type for
%   which no declaration is provided, but this is not an error.
%   
%   An element type declaration takes the form:
%   
%   Element Type Declaration
%   [45]  elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [ VC:
%   Unique Element Type Declaration ]

% elementdecl recognises an element type declaration, production [45]:
%   '<!ELEMENT' S Name S contentspec S? '>'
% The name and the content spec are each parsed through pe/1.  The new
% declaration is created with a freshly initialised (empty) attribute
% collection and stored under its name in the table kept in gElements.
% This predicate does not consult gAttributes.
% NOTE(review): attlistDecl parks attributes in gAttributes when the
% element has not been declared yet; confirm that some later step
% attaches them to the element stored here.
:- pred elementdecl(pstate(_), pstate('')).
:- mode elementdecl(in, out) is det.

elementdecl -->
    lit("<!ELEMENT") then (pred(_::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(name) then (pred(ElemName::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(contentspec) then (pred(Spec::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1('>') then (pred(_::in, pdi, puo) is det -->
    { init(NoAttrs) },
    get(gElements, elements(Decls0)),
    { set(Decls0, ElemName, element(ElemName, NoAttrs, Spec), Decls) },
    set(gElements, elements(Decls)),
    return
    ))))))).

%   [46]  contentspec ::= 'EMPTY' | 'ANY' | Mixed | children

% contentspec recognises production [46].  The alternatives are tried
% left to right: the keywords EMPTY and ANY (yielding `empty' and `any'
% respectively), then a Mixed declaration, then a children model.
:- pred contentspec(pstate(_), pstate(dtd:content)).
:- mode contentspec(in, out) is det.

contentspec -->
    lit("EMPTY", empty) or lit("ANY", any) or mixed or children.

%   
%   where the Name gives the element type being declared.
%   
%   Validity Constraint: Unique Element Type Declaration
%   No element type may be declared more than once.
%   
%   Examples of element type declarations:
%   
%   <!ELEMENT br EMPTY>
%   <!ELEMENT p (#PCDATA|emph)* >
%   <!ELEMENT %name.para; %content.para; >
%   <!ELEMENT container ANY>
%   
%    3.2.1 Element Content
%    
%   An element type has element content when elements of that type must
%   contain only child elements (no character data), optionally separated
%   by white space (characters matching the nonterminal S). In this case,
%   the constraint includes a content model, a simple grammar governing
%   the allowed types of the child elements and the order in which they
%   are allowed to appear. The grammar is built on content particles
%   (cps), which consist of names, choice lists of content particles, or
%   sequence lists of content particles:
%   
%   Element-content Models
%   [47]  children ::= (choice | seq) ('?' | '*' | '+')?

% children recognises an element-content model, production [47]:
% a choice or a seq group followed by an optional occurrence indicator.
% The result is children(Group - Multiplicity).
:- pred children(pstate(_), pstate(dtd:content)).
:- mode children(in, out) is det.

children -->
    (choice or seq) then (pred(Group::in, pdi, puo) is det -->
    multiplicity then (pred(Mult::in, pdi, puo) is det -->
    return(children(Group - Mult))
    )).

% multiplicity recognises the optional occurrence indicator that may
% follow a content particle: '?', '*' or '+'.  Each literal is paired
% with the multiplicity value of the same spelling.
% NOTE(review): the second argument of opt/2, '1', is presumably the
% value returned when no indicator is present ("exactly once") --
% confirm against the definition of opt/2.
:- pred multiplicity(pstate(_), pstate(multiplicity)).
:- mode multiplicity(in, out) is det.

multiplicity -->
    opt(lit1(('?'), ('?')) or lit1(('*'), ('*')) or lit1(('+'), ('+')), '1').

%   [48]  cp ::= (Name | choice | seq) ('?' | '*' | '+')?

% cp recognises a content particle, production [48]: a Name, a choice
% group or a seq group, followed by an optional occurrence indicator.
% A bare name is converted with nameToChild/2.  The result is the pair
% Particle - Multiplicity.
:- pred cp(pstate(_), pstate(contentParticle)).
:- mode cp(in, out) is det.

cp -->
    ((wrap(name, nameToChild) or choice or seq) and multiplicity)
    then (pred((Particle, Mult)::in, pdi, puo) is det -->
        return(Particle - Mult)
    ).

% nameToChild(Name, Child): wraps an element name as the `element'
% alternative of the children type.
:- pred nameToChild(name, children).
:- mode nameToChild(in, out) is det.

nameToChild(Name, Child) :-
    Child = element(Name).

%   [49]  choice ::= '(' S? cp ( S? '|' S? cp )* S? ')' [ VC: Proper
%   Group/PE Nesting ]

% choice recognises a choice group, production [49]:
%   '(' S? cp ( S? '|' S? cp )* S? ')'
% Every content particle is parsed through pe/1.  The result is
% alt/1 applied to the list of particles in source order.
:- pred choice(pstate(_), pstate(children)).
:- mode choice(in, out) is det.

choice -->
    lit1('(') then (pred(_::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    pe(cp) then (pred(First::in, pdi, puo) is det -->
    star(snd((opt(s) and lit1('|') and opt(s)) and pe(cp)))
        then (pred(Rest::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1(')') then (pred(_::in, pdi, puo) is det -->
    return(alt([First|Rest]))
    )))))).

%   [50]  seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' [ VC: Proper Group/PE
%   Nesting ]

% seq recognises a sequence group, production [50]:
%   '(' S? cp ( S? ',' S? cp )* S? ')'
% Every content particle is parsed through pe/1.  The result is
% seq/1 applied to the list of particles in source order.
:- pred seq(pstate(_), pstate(children)).
:- mode seq(in, out) is det.

seq -->
    lit1('(') then (pred(_::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    pe(cp) then (pred(First::in, pdi, puo) is det -->
    star(snd((opt(s) and lit1(',') and opt(s)) and pe(cp)))
        then (pred(Rest::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1(')') then (pred(_::in, pdi, puo) is det -->
    return(seq([First|Rest]))
    )))))).

%   
%   where each Name is the type of an element which may appear as a child.
%   Any content particle in a choice list may appear in the element
%   content at the location where the choice list appears in the grammar;
%   content particles occurring in a sequence list must each appear in the
%   element content in the order given in the list. The optional character
%   following a name or list governs whether the element or the content
%   particles in the list may occur one or more (+), zero or more (*), or
%   zero or one times (?). The absence of such an operator means that the
%   element or content particle must appear exactly once. This syntax and
%   meaning are identical to those used in the productions in this
%   specification.
%   
%   The content of an element matches a content model if and only if it is
%   possible to trace out a path through the content model, obeying the
%   sequence, choice, and repetition operators and matching each element
%   in the content against an element type in the content model. For
%   compatibility, it is an error if an element in the document can match
%   more than one occurrence of an element type in the content model. For
%   more information, see "E. Deterministic Content Models".
%   
%   Validity Constraint: Proper Group/PE Nesting
%   Parameter-entity replacement text must be properly nested with
%   parenthesized groups. That is to say, if either of the opening or
%   closing parentheses in a choice, seq, or Mixed construct is contained
%   in the replacement text for a parameter entity, both must be contained
%   in the same replacement text. For interoperability, if a
%   parameter-entity reference appears in a choice, seq, or Mixed
%   construct, its replacement text should not be empty, and neither the
%   first nor last non-blank character of the replacement text should be a
%   connector (| or ,).
%   
%   Examples of element-content models:
%   
%   <!ELEMENT spec (front, body, back?)>
%   <!ELEMENT div1 (head, (p | list | note)*, div2*)>
%   <!ELEMENT dictionary-body (%div.mix; | %dict.mix;)*>
%   
%    3.2.2 Mixed Content
%    
%   An element type has mixed content when elements of that type may
%   contain character data, optionally interspersed with child elements.
%   In this case, the types of the child elements may be constrained, but
%   not their order or their number of occurrences:
%   
%   Mixed-content Declaration
%   [51]  Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
%   | '(' S? '#PCDATA' S? ')' [ VC: Proper Group/PE Nesting ]
%   [ VC: No Duplicate Types ]

% mixed recognises a mixed-content declaration, production [51].
% mixed1 (the form ending in ")*", which may list child names) is tried
% first; mixed2 (the plain "(#PCDATA)" form) is the second alternative.
:- pred mixed(pstate(_), pstate(dtd:content)).
:- mode mixed(in, out) is det.

mixed -->
    mixed1 or mixed2.

% mixed1 recognises the first alternative of production [51]:
%   '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
% and returns mixed(mixed(Names)) with the child names in source order.
% NOTE(review): unlike choice and seq, the names here are parsed with
% plain name rather than pe(name), so a parameter-entity reference in
% the name list is presumably not accepted -- confirm whether that is
% intended.
:- pred mixed1(pstate(_), pstate(dtd:content)).
:- mode mixed1(in, out) is det.

mixed1 -->
    lit1('(') then (pred(_::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit("#PCDATA") then (pred(_::in, pdi, puo) is det -->
    star(snd((opt(s) and lit1('|') and opt(s)) and name))
        then (pred(ChildNames::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit(")*") then (pred(_::in, pdi, puo) is det -->
    return(mixed(mixed(ChildNames)))
    )))))).

% mixed2 recognises the second alternative of production [51]:
%   '(' S? '#PCDATA' S? ')'
% i.e. character data only, and returns mixed(mixed([])) -- a mixed
% content model with an empty list of child names.
:- pred mixed2(pstate(_), pstate(dtd:content)).
:- mode mixed2(in, out) is det.

mixed2 -->
    lit1('(') and opt(s) and lit("#PCDATA") and opt(s) and lit1(')')
    				    then (pred(_::in, pdi, puo) is det -->
    return(mixed(mixed([])))
    ).

%   
%   where the Names give the types of elements that may appear as
%   children.
%   
%   Validity Constraint: No Duplicate Types
%   The same name must not appear more than once in a single mixed-content
%   declaration.
%   
%   Examples of mixed content declarations:
%   
%   <!ELEMENT p (#PCDATA|a|ul|b|i|em)*>
%   <!ELEMENT p (#PCDATA | %font; | %phrase; | %special; | %form;)* >
%   <!ELEMENT b (#PCDATA)>
%   
%  3.3 Attribute-List Declarations
%  
%   Attributes are used to associate name-value pairs with elements.
%   Attribute specifications may appear only within start-tags and
%   empty-element tags; thus, the productions used to recognize them
%   appear in "3.1 Start-Tags, End-Tags, and Empty-Element Tags".
%   Attribute-list declarations may be used:
%     * To define the set of attributes pertaining to a given element
%       type.
%     * To establish type constraints for these attributes.
%     * To provide default values for attributes.
%       
%   Attribute-list declarations specify the name, data type, and default
%   value (if any) of each attribute associated with a given element type:
%   
%   Attribute-list Declaration
%   [52]  AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'

% attlistDecl recognises an attribute-list declaration, production [52]:
%   '<!ATTLIST' S Name AttDef* S? '>'
% and records the declared attributes.
%
% The XML specification (section 3.3) says that when several
% definitions are given for the same attribute of an element type, the
% first one is binding and later ones are ignored, and that all
% attribute-list declarations for one element type are merged; neither
% situation is an error.  The attribute definitions are therefore
% folded, in source order, into whatever has already been recorded for
% the element, and a definition is added only if no attribute of that
% name is present yet.
%
% If the element type is already in the gElements table, its eAttrs
% field is updated.  Otherwise the attributes are kept under the
% element name in the gAttributes table, merged with any attributes
% stored there by an earlier declaration for the same name.
:- pred attlistDecl(pstate(_), pstate('')).
:- mode attlistDecl(in, out) is det.

attlistDecl -->
    lit("<!ATTLIST") then (pred(_::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(name) then (pred(Name::in, pdi, puo) is det -->
    pe(star(attDef)) then (pred(AttsList::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1('>') then (pred(_::in, pdi, puo) is det -->
	% "First declaration is binding": keep an existing entry, add the
	% definition only when the attribute name is not yet known.
    { AddIfNew = (pred(Att::in, Atts1::in, Atts2::out) is det :-
	Att = attribute(AttName, _, _),
	( search(Atts1, AttName, _) ->
	    Atts2 = Atts1
	;
	    set(Atts1, AttName, Att, Atts2)
	)
    ) },
    get(gElements, elements(Elements0)),
    ( { search(Elements0, Name, Element0) } ->
	    % The element is already declared: extend its attributes.
	{ foldl(AddIfNew, AttsList, Element0^eAttrs, Attrs) },
	{ Element = Element0^eAttrs := Attrs },
	{ set(Elements0, Name, Element, Elements) },
	set(gElements, elements(Elements))
    ;
	    % Not declared (yet): extend the pending attributes for this
	    % name, starting from an empty collection if there are none.
	get(gAttributes, attributes(Attributes0)),
	{ ( search(Attributes0, Name, Pending0) ->
	    Pending = Pending0
	;
	    init(Pending)
	) },
	{ foldl(AddIfNew, AttsList, Pending, Atts) },
	{ set(Attributes0, Name, Atts, Attributes) },
	set(gAttributes, attributes(Attributes))
    ),
    return
    )))))).

%   [53]  AttDef ::= S Name S AttType S DefaultDecl

% attDef recognises one attribute definition, production [53]:
%   S Name S AttType S DefaultDecl
% The name, type and default are each parsed through pe/1 and returned
% together as attribute(Name, Type, Default).
:- pred attDef(pstate(_), pstate(dtd:attribute)).
:- mode attDef(in, out) is det.

attDef -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(name) then (pred(AttrName::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(attType) then (pred(AttrType::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pe(defaultDecl) then (pred(AttrDefault::in, pdi, puo) is det -->
    return(attribute(AttrName, AttrType, AttrDefault))
    )))))).

%   
%   The Name in the AttlistDecl rule is the type of an element. At user
%   option, an XML processor may issue a warning if attributes are
%   declared for an element type not itself declared, but this is not an
%   error. The Name in the AttDef rule is the name of the attribute.
%   
%   When more than one AttlistDecl is provided for a given element type,
%   the contents of all those provided are merged. When more than one
%   definition is provided for the same attribute of a given element type,
%   the first declaration is binding and later declarations are ignored.
%   For interoperability, writers of DTDs may choose to provide at most
%   one attribute-list declaration for a given element type, at most one
%   attribute definition for a given attribute name, and at least one
%   attribute definition in each attribute-list declaration. For
%   interoperability, an XML processor may at user option issue a warning
%   when more than one attribute-list declaration is provided for a given
%   element type, or more than one attribute definition is provided for a
%   given attribute, but this is not an error.
%   
%    3.3.1 Attribute Types
%    
%   XML attribute types are of three kinds: a string type, a set of
%   tokenized types, and enumerated types. The string type may take any
%   literal string as a value; the tokenized types have varying lexical
%   and semantic constraints, as noted:
%   
%   Attribute Types
%   [54]  AttType ::= StringType | TokenizedType | EnumeratedType

% attType recognises production [54].  The alternatives are tried in
% the order StringType, TokenizedType, EnumeratedType.
:- pred attType(pstate(_), pstate(type)).
:- mode attType(in, out) is det.

attType -->
    stringType or tokenizedType or enumeratedType.

%   [55]  StringType ::= 'CDATA'

% stringType recognises production [55]: the keyword CDATA, for which
% it returns the type value `cdata'.
:- pred stringType(pstate(_), pstate(type)).
:- mode stringType(in, out) is det.

stringType -->
    lit("CDATA")		    then (pred(_::in, pdi, puo) is det -->
    return(cdata)
    ).

%   [56]  TokenizedType ::= 'ID' [ VC: ID ]
%   [ VC: One ID per Element Type ]
%   [ VC: ID Attribute Default ]
%   | 'IDREF' [ VC: IDREF ]
%   | 'IDREFS' [ VC: IDREF ]
%   | 'ENTITY' [ VC: Entity Name ]
%   | 'ENTITIES' [ VC: Entity Name ]
%   | 'NMTOKEN' [ VC: Name Token ]
%   | 'NMTOKENS' [ VC: Name Token ]

% tokenizedType recognises production [56]: one of the keywords ID,
% IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, each mapped to the
% type value of the same (lower-case) name.
:- pred tokenizedType(pstate(_), pstate(type)).
:- mode tokenizedType(in, out) is det.

tokenizedType -->

	% Because or/4 commits to the first match, the ordering here
	% is significant because IDREFS matches IDREF matches ID, etc.
	% The longer keyword must come first wherever one keyword is a
	% prefix of another (IDREFS/IDREF/ID, NMTOKENS/NMTOKEN).  ENTITY
	% and ENTITIES differ at their sixth character, so neither is a
	% prefix of the other and their relative order does not matter.

    lit("IDREFS", idrefs) or lit("IDREF", idref) or lit("ID", id) or
    lit("ENTITY", entity) or lit("ENTITIES", entities) or
    lit("NMTOKENS", nmtokens) or lit("NMTOKEN", nmtoken).

%   
%   Validity Constraint: ID
%   Values of type ID must match the Name production. A name must not
%   appear more than once in an XML document as a value of this type;
%   i.e., ID values must uniquely identify the elements which bear them.
%   
%   Validity Constraint: One ID per Element Type
%   No element type may have more than one ID attribute specified.
%   
%   Validity Constraint: ID Attribute Default
%   An ID attribute must have a declared default of #IMPLIED or #REQUIRED.
%   
%   Validity Constraint: IDREF
%   Values of type IDREF must match the Name production, and values of
%   type IDREFS must match Names; each Name must match the value of an ID
%   attribute on some element in the XML document; i.e. IDREF values must
%   match the value of some ID attribute.
%   
%   Validity Constraint: Entity Name
%   Values of type ENTITY must match the Name production, values of type
%   ENTITIES must match Names; each Name must match the name of an
%   unparsed entity declared in the DTD.
%   
%   Validity Constraint: Name Token
%   Values of type NMTOKEN must match the Nmtoken production; values of
%   type NMTOKENS must match Nmtokens.
%   
%   Enumerated attributes can take one of a list of values provided in the
%   declaration. There are two kinds of enumerated types:
%   
%   Enumerated Attribute Types
%   [57]  EnumeratedType ::= NotationType | Enumeration

% enumeratedType recognises production [57]: a NotationType (tried
% first) or a plain Enumeration.
:- pred enumeratedType(pstate(_), pstate(type)).
:- mode enumeratedType(in, out) is det.

enumeratedType -->
    notationType or enumeration.

%   [58]  NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S?
%   ')' [ VC: Notation Attributes ]

% notationType recognises a notation attribute type, production [58]:
%   'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
% and returns notation(Names) with the notation names in source order.
%
% The members of the list are parsed with `name', as the production
% requires.  (Nmtoken is the production used by Enumeration [59]; it is
% more permissive than Name and would accept tokens that are not legal
% notation names.)
:- pred notationType(pstate(_), pstate(type)).
:- mode notationType(in, out) is det.

notationType -->
    lit("NOTATION") then (pred(_::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    lit1('(') then (pred(_::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    name then (pred(First::in, pdi, puo) is det -->
    star(snd((opt(s) and lit1('|') and opt(s)) and name))
        then (pred(Rest::in, pdi, puo) is det -->
    opt(s) then (pred(_::in, pdi, puo) is det -->
    lit1(')') then (pred(_::in, pdi, puo) is det -->
    return(notation([First|Rest]))
    )))))))).

%   [59]  Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [ VC:
%   Enumeration ]

% Production [59]: parse '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' and
% return the list of tokens, first token at the head.
% NOTE(review): the result is built with the `notation' constructor, the
% same one notationType uses, so callers cannot tell the two kinds apart
% from the returned value -- confirm whether the dtd `type' has a separate
% constructor for plain enumerations.
:- pred enumeration(pstate(_), pstate(type)).
:- mode enumeration(in, out) is det.

enumeration -->
    lit1('(')			    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    nmtoken			    then (pred(Token::in, pdi, puo) is det -->
    % Each repetition is `S? | S? Nmtoken'; snd keeps only the token.
    star(snd((opt(s) and lit1('|') and opt(s)) and nmtoken))
    				    then (pred(Tokens::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit1(')')			    then (pred(_::in, pdi, puo) is det -->
    { Type = notation([Token|Tokens]) },
    return(Type)
    )))))).

%   
%   A NOTATION attribute identifies a notation, declared in the DTD with
%   associated system and/or public identifiers, to be used in
%   interpreting the element to which the attribute is attached.
%   
%   Validity Constraint: Notation Attributes
%   Values of this type must match one of the notation names included in
%   the declaration; all notation names in the declaration must be
%   declared.
%   
%   Validity Constraint: Enumeration
%   Values of this type must match one of the Nmtoken tokens in the
%   declaration.
%   
%   For interoperability, the same Nmtoken should not occur more than once
%   in the enumerated attribute types of a single element type.
%   
%    3.3.2 Attribute Defaults
%    
%   An attribute declaration provides information on whether the
%   attribute's presence is required, and if not, how an XML processor
%   should react if a declared attribute is absent in a document.
%   
%   Attribute Defaults
%   [60]  DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
%   | (('#FIXED' S)? AttValue) [ VC: Required Attribute ]
%   [ VC: Attribute Default Legal ]
%   [ WFC: No < in Attribute Values ]
%   [ VC: Fixed Attribute Default ]

% Production [60]: parse an attribute default declaration.  Four
% alternatives are combined with `or', in this order:
%   "#REQUIRED"            -> required
%   "#IMPLIED"             -> implied
%   "#FIXED" S AttValue    -> fixed(AttValue)      (via wrapFixed)
%   AttValue               -> defaulted(AttValue)  (via wrapDefault)
% In the #FIXED case snd discards the keyword and white space so that only
% the attribute value reaches wrapFixed.
:- pred defaultDecl(pstate(_), pstate(default)).
:- mode defaultDecl(in, out) is det.

defaultDecl -->
    lit("#REQUIRED", required) or
    lit("#IMPLIED", implied) or
    wrap(snd((lit("#FIXED", '') and s) and attValue), wrapFixed) or
    wrap(attValue, wrapDefault).

% Wrap an attribute value in the `fixed' constructor of the `default'
% type; used by defaultDecl for the "#FIXED" alternative.
:- pred wrapFixed(string::in, default::out) is det.

wrapFixed(Value, Default) :-
    Default = fixed(Value).

% Wrap an attribute value in the `defaulted' constructor of the `default'
% type; used by defaultDecl when a bare AttValue is given.
:- pred wrapDefault(string::in, default::out) is det.

wrapDefault(Value, Default) :-
    Default = defaulted(Value).

%   
%   In an attribute declaration, #REQUIRED means that the attribute must
%   always be provided, #IMPLIED that no default value is provided. If the
%   declaration is neither #REQUIRED nor #IMPLIED, then the AttValue value
%   contains the declared default value; the #FIXED keyword states that
%   the attribute must always have the default value. If a default value
%   is declared, when an XML processor encounters an omitted attribute, it
%   is to behave as though the attribute were present with the declared
%   default value.
%   
%   Validity Constraint: Required Attribute
%   If the default declaration is the keyword #REQUIRED, then the
%   attribute must be specified for all elements of the type in the
%   attribute-list declaration.
%   
%   Validity Constraint: Attribute Default Legal
%   The declared default value must meet the lexical constraints of the
%   declared attribute type.
%   
%   Validity Constraint: Fixed Attribute Default
%   If an attribute has a default value declared with the #FIXED keyword,
%   instances of that attribute must match the default value.
%   
%   Examples of attribute-list declarations:
%   
%   <!ATTLIST termdef
%             id      ID      #REQUIRED
%             name    CDATA   #IMPLIED>
%   <!ATTLIST list
%             type    (bullets|ordered|glossary)  "ordered">
%   <!ATTLIST form
%             method  CDATA   #FIXED "POST">
%   
%    3.3.3 Attribute-Value Normalization
%    
%   Before the value of an attribute is passed to the application or
%   checked for validity, the XML processor must normalize it as follows:
%     * a character reference is processed by appending the referenced
%       character to the attribute value
%     * an entity reference is processed by recursively processing the
%       replacement text of the entity
%     * a whitespace character (#x20, #xD, #xA, #x9) is processed by
%       appending #x20 to the normalized value, except that only a single
%       #x20 is appended for a "#xD#xA" sequence that is part of an
%       external parsed entity or the literal entity value of an internal
%       parsed entity
%     * other characters are processed by appending them to the normalized
%       value
%       
%   If the declared value is not CDATA, then the XML processor must
%   further process the normalized attribute value by discarding any
%   leading and trailing space (#x20) characters, and by replacing
%   sequences of space (#x20) characters by a single space (#x20)
%   character.
%   
%   All attributes for which no declaration has been read should be
%   treated by a non-validating parser as if declared CDATA.
%   
%  3.4 Conditional Sections
%  
%   Conditional sections are portions of the document type declaration
%   external subset which are included in, or excluded from, the logical
%   structure of the DTD based on the keyword which governs them.
%   
%   Conditional Section
%   [61]  conditionalSect ::= includeSect | ignoreSect

% Production [61]: a conditional section is either an INCLUDE section or
% an IGNORE section.  The two sub-parsers are combined with the `or'
% combinator, with includeSect listed first.
:- pred conditionalSect(pstate(_)::in, pstate('')::out) is det.

conditionalSect -->
    or(includeSect, ignoreSect).

%   [62]  includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'

% Production [62]: parse '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'.
% Every intermediate result is discarded; the declarations inside the
% section are processed by extSubsetDecl itself.
:- pred includeSect(pstate(_), pstate('')).
:- mode includeSect(in, out) is det.

includeSect -->
    lit("<![")			    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit("INCLUDE")		    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit1('[')			    then (pred(_::in, pdi, puo) is det -->
    extSubsetDecl		    then (pred(_::in, pdi, puo) is det -->
    lit("]]>")			    then (pred(_::in, pdi, puo) is det -->
    return
    ))))))).


%   [63]  ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents*
%   ']]>'

% Production [63]: parse '<![' S? 'IGNORE' S? '[' ignoreSectContents* ...
% All intermediate results are discarded.
% NOTE(review): unlike includeSect there is no explicit lit("]]>") here;
% presumably the closing delimiter is consumed as the terminator matched
% inside ignoreSectContents/ignore -- confirm against upto in parsing.m.
:- pred ignoreSect(pstate(_), pstate('')).
:- mode ignoreSect(in, out) is det.

ignoreSect -->
    lit("<![")			    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit("IGNORE")		    then (pred(_::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit1('[')			    then (pred(_::in, pdi, puo) is det -->
    star(ignoreSectContents)	    then (pred(_::in, pdi, puo) is det -->
    return
    )))))).

%   [64]  ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>'
%   Ignore)*

% Production [64]: skip the contents of an ignored conditional section.
% `ignore' returns the delimiter string it stopped at.  When that
% delimiter is "<![" a nested section has been opened, so zero or more
% repetitions of (nested contents, "]]>", more ignored text) are parsed;
% otherwise there is nothing further to do at this level.
:- pred ignoreSectContents(pstate(_), pstate('')).
:- mode ignoreSectContents(in, out) is det.

ignoreSectContents -->
    ignore			    then (pred(Ign::in, pdi, puo) is det -->
    ( { Ign = "<![" } ->
        (star(ignoreSectContents and lit("]]>") and ignore)
    				    then (pred(_::in, pdi, puo) is det -->
	return
	))
    ; % { Ign = "]]>" } ->
        % The only other delimiter `ignore' looks for is "]]>".
        return
    )).

%   [65]  Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)

% Production [65]: scan with `upto' using `char' as the item parser and
% either "<![" or "]]>" as the stopping delimiter.  `upto' yields a pair;
% the first component (the skipped text) is dropped and the second (the
% delimiter that was matched) is returned to the caller.
% NOTE(review): presumably `upto' consumes the delimiter as well as the
% text before it -- confirm in parsing.m.
:- pred ignore(pstate(_), pstate(string)).
:- mode ignore(in, out) is det.

ignore -->
    upto(char, lit("<![") or lit("]]>"))
    				    then (pred(P::in, pdi, puo) is det -->
    { P = (_Ign, Terminal) },
    return(Terminal)
    ).

%   
%   Like the internal and external DTD subsets, a conditional section may
%   contain one or more complete declarations, comments, processing
%   instructions, or nested conditional sections, intermingled with white
%   space.
%   
%   If the keyword of the conditional section is INCLUDE, then the
%   contents of the conditional section are part of the DTD. If the
%   keyword of the conditional section is IGNORE, then the contents of the
%   conditional section are not logically part of the DTD. Note that for
%   reliable parsing, the contents of even ignored conditional sections
%   must be read in order to detect nested conditional sections and ensure
%   that the end of the outermost (ignored) conditional section is
%   properly detected. If a conditional section with a keyword of INCLUDE
%   occurs within a larger conditional section with a keyword of IGNORE,
%   both the outer and the inner conditional sections are ignored.
%   
%   If the keyword of the conditional section is a parameter-entity
%   reference, the parameter entity must be replaced by its content before
%   the processor decides whether to include or ignore the conditional
%   section.
%   
%   An example:
%   
%   <!ENTITY % draft 'INCLUDE' >
%   <!ENTITY % final 'IGNORE' >
%   
%   <![%draft;[
%   <!ELEMENT book (comments*, title, body, supplements?)>
%   ]]>
%   <![%final;[
%   <!ELEMENT book (title, body, supplements?)>
%   ]]>
%   
%4. Physical Structures
%
%   An XML document may consist of one or many storage units. These are
%   called entities; they all have content and are all (except for the
%   document entity, see below, and the external DTD subset) identified by
%   name. Each XML document has one entity called the document entity,
%   which serves as the starting point for the XML processor and may
%   contain the whole document.
%   
%   Entities may be either parsed or unparsed. A parsed entity's contents
%   are referred to as its replacement text; this text is considered an
%   integral part of the document.
%   
%   An unparsed entity is a resource whose contents may or may not be
%   text, and if text, may not be XML. Each unparsed entity has an
%   associated notation, identified by name. Beyond a requirement that an
%   XML processor make the identifiers for the entity and notation
%   available to the application, XML places no constraints on the
%   contents of unparsed entities.
%   
%   Parsed entities are invoked by name using entity references; unparsed
%   entities by name, given in the value of ENTITY or ENTITIES attributes.
%   
%   General entities are entities for use within the document content. In
%   this specification, general entities are sometimes referred to with
%   the unqualified term entity when this leads to no ambiguity. Parameter
%   entities are parsed entities for use within the DTD. These two types
%   of entities use different forms of reference and are recognized in
%   different contexts. Furthermore, they occupy different namespaces; a
%   parameter entity and a general entity with the same name are two
%   distinct entities.
%   
%  4.1 Character and Entity References
%  
%   A character reference refers to a specific character in the ISO/IEC
%   10646 character set, for example one not directly accessible from
%   available input devices.
%   
%   Character Reference
%   [66]  CharRef ::= '&#' [0-9]+ ';'
%   | '&#x' [0-9a-fA-F]+ ';' [ WFC: Legal Character ]

% Production [66]: parse a character reference and return the code point
% it denotes.  The hexadecimal form "&#x" hexdigits ";" is listed first
% and the decimal form "&#" digits ";" second.  In both cases the digit
% characters collected by plus/1 are folded into a number, starting from
% 0, by hex_digits_to_number/3 or decimal_digits_to_number/3.
% NOTE(review): nothing in this predicate checks the resulting code point
% against the Char production (WFC: Legal Character).
:- pred charRef(pstate(_), pstate(unicode)).
:- mode charRef(in, out) is det.

charRef -->
    (lit("&#x") and
     plus(range('a', 'f') or range('A', 'F') or range('0', '9')) and
     lit1(';')
				then (pred((_, Hs, _)::in, pdi, puo) is det -->
    { hex_digits_to_number(Hs, 0, UniCode) },
    return(UniCode)
    )) or
    (lit("&#") and plus(range('0', '9')) and lit1(';')
				then (pred((_, Ds, _)::in, pdi, puo) is det -->
    { decimal_digits_to_number(Ds, 0, UniCode) },
    return(UniCode)
    )).

% decimal_digits_to_number(Digits, Acc0, Number):
% Fold the decimal digit characters in Digits (most significant first)
% into the accumulator Acc0, giving Number.  Each step multiplies the
% accumulator by ten and adds the digit's offset from '0'.  The digits are
% not range-checked here; the caller is expected to pass only '0'..'9'.
:- pred decimal_digits_to_number(list(unicode)::in, int::in, int::out) is det.

decimal_digits_to_number(Digits, Acc0, Number) :-
	(
	    Digits = [],
	    Number = Acc0
	;
	    Digits = [Digit | Rest],
	    Acc = 10 * Acc0 + (Digit - '0'),
	    decimal_digits_to_number(Rest, Acc, Number)
	).

% hex_digits_to_number(Digits, Acc0, Number):
% Fold the hexadecimal digit characters in Digits (most significant
% first) into the accumulator Acc0, giving Number.  Each step multiplies
% the accumulator by sixteen and adds the value of the digit:
%   '0'..'9' -> 0..9,  'a'..'f' -> 10..15,  'A'..'F' -> 10..15.
% Any other character calls error/1; charRef only passes characters from
% those three ranges, so that branch signals an internal inconsistency.
:- pred hex_digits_to_number(list(unicode), int, int).
:- mode hex_digits_to_number(in, in, out) is det.

hex_digits_to_number([], U, U).
hex_digits_to_number([D|Ds], U0, U) :-
	( '0' =< D, D =< '9' ->
	    U1 = 16 * U0 + (D - '0')
	; 'a' =< D, D =< 'f' ->
	    U1 = 16 * U0 + 10 + (D - 'a')
	; 'A' =< D, D =< 'F' ->
	    % Upper-case digits must be measured from 'A'.  Measuring them
	    % from 'a' (as this branch used to) made every upper-case digit
	    % 32 too small, so e.g. "&#x3C;" decoded to the wrong code point.
	    U1 = 16 * U0 + 10 + (D - 'A')
	;
	    error("hex_digits_to_number: internal error")
	),
	hex_digits_to_number(Ds, U1, U).

%   
%   Well-Formedness Constraint: Legal Character
%   Characters referred to using character references must match the
%   production for Char.
%   If the character reference begins with "&#x", the digits and letters
%   up to the terminating ; provide a hexadecimal representation of the
%   character's code point in ISO/IEC 10646. If it begins just with "&#",
%   the digits up to the terminating ; provide a decimal representation of
%   the character's code point.
%   
%   An entity reference refers to the content of a named entity.
%   References to parsed general entities use ampersand (&) and semicolon
%   (;) as delimiters. Parameter-entity references use percent-sign (%)
%   and semicolon (;) as delimiters.
%   
%   Entity Reference
%   [67]  Reference ::= EntityRef | CharRef

%:- pred reference(pstate(_), pstate('')).
%:- mode reference(in, out) is det.
%
%reference -->
%    % Note: charRef has been migrated to charData, etc. because it
%    % is handled differently to normal entity references.
%    entityRef.

%   [68]  EntityRef ::= '&' Name ';' [ WFC: Entity Declared ]
%   [ VC: Entity Declared ]
%   [ WFC: Parsed Entity ]
%   [ WFC: No Recursion ]

% entityRef(Parser): parse a general entity reference with entityRef/2,
% wrap the value it returns with mkEntity/1, and hand that entity together
% with Parser to parseEntity.
% NOTE(review): `null' yields a value X that is re-returned just before
% parseEntity is called; presumably this restores a result of type T1 so
% that Parser starts from the state type it expects -- confirm against
% null/return in parsing.m.
:- pred entityRef(parser(T1, T2), pstate(T1), pstate(T2)).
:- mode entityRef(in(parser), in, out) is det.

entityRef(Parser) -->
    null			   then (pred(X::in, pdi, puo) is det -->
    entityRef			   then (pred(Entity::in, pdi, puo) is det -->
    return(X),
    parseEntity(Parser, mkEntity(Entity))
    )).

% Production [68]: parse '&' Name ';' and return the referenced entity.
% The name is looked up in the table stored under the gEntities global.
% If it is found, getEntity is run on the stored definition and its result
% is returned; otherwise error/1 is called with the message
% "reference to undefined entity `Name'".
:- pred entityRef(pstate(_), pstate(dtd:entity)).
:- mode entityRef(in, out) is det.

entityRef -->
    lit1('&')			    then (pred(_::in, pdi, puo) is det -->
    name			    then (pred(Name::in, pdi, puo) is det -->
    lit1(';')			    then (pred(_::in, pdi, puo) is det -->
    get(gEntities, entities(Entities)),
    ( { search(Entities, Name, EntityDef) } ->
	(getEntity(EntityDef)	    then (pred(Entity::in, pdi, puo) is det -->
	return(Entity)
	))
    ;
    	{ format("reference to undefined entity `%s'", [s(Name)], Msg) },
	error(Msg)
    )))).

%   [69]  PEReference ::= '%' Name ';' [ VC: Entity Declared ]
%   [ WFC: No Recursion ]
%   [ WFC: In DTD ]

% pEReference(Parser): parse a parameter-entity reference with
% pEReference/2, wrap the value it returns with mkEntity/1, and hand that
% entity together with Parser to parseEntity.
% NOTE(review): `null' yields a value X that is re-returned just before
% parseEntity is called; presumably this restores a result of type T1 so
% that Parser starts from the state type it expects -- confirm against
% null/return in parsing.m.
:- pred pEReference(parser(T1, T2), pstate(T1), pstate(T2)).
:- mode pEReference(in(parser), in, out) is det.

pEReference(Parser) -->
    null			   then (pred(X::in, pdi, puo) is det -->
    pEReference			   then (pred(Entity::in, pdi, puo) is det -->
    return(X),
    parseEntity(Parser, mkEntity(Entity))
    )).

% Production [69]: parse '%' Name ';' and return the referenced parameter
% entity.  The name is looked up in the table stored under the gPEntities
% global.  If it is found, getEntity is run on the stored definition and
% its result is returned; otherwise error/1 is called with the message
% "reference to undefined parameter entity `Name'".
:- pred pEReference(pstate(_), pstate(dtd:entity)).
:- mode pEReference(in, out) is det.

pEReference -->
    lit1('%')			    then (pred(_::in, pdi, puo) is det -->
    name			    then (pred(Name::in, pdi, puo) is det -->
    lit1(';')			    then (pred(_::in, pdi, puo) is det -->
    get(gPEntities, pentities(PEntities)),
    ( { search(PEntities, Name, EntityDef) } ->
	(getEntity(EntityDef)	    then (pred(Entity::in, pdi, puo) is det -->
	return(Entity)
	))
    ;
    	{ format("reference to undefined parameter entity `%s'",
	    [s(Name)], Msg) },
	error(Msg)
    )))).

%   
%   Well-Formedness Constraint: Entity Declared
%   In a document without any DTD, a document with only an internal DTD
%   subset which contains no parameter entity references, or a document
%   with "standalone='yes'", the Name given in the entity reference must
%   match that in an entity declaration, except that well-formed
%   documents need not declare any of the following entities: amp, lt, gt,
%   apos, quot. The declaration of a parameter entity must precede any
%   reference to it. Similarly, the declaration of a general entity must
%   precede any reference to it which appears in a default value in an
%   attribute-list declaration. Note that if entities are declared in the
%   external subset or in external parameter entities, a non-validating
%   processor is not obligated to read and process their declarations; for
%   such documents, the rule that an entity must be declared is a
%   well-formedness constraint only if standalone='yes'.
%   
%   Validity Constraint: Entity Declared
%   In a document with an external subset or external parameter entities
%   with "standalone='no'", the Name given in the entity reference must
%   match that in an entity declaration. For interoperability, valid
%   documents should declare the entities amp, lt, gt, apos, quot, in the
%   form specified in "4.6 Predefined Entities". The declaration of a
%   parameter entity must precede any reference to it. Similarly, the
%   declaration of a general entity must precede any reference to it which
%   appears in a default value in an attribute-list declaration.
%   
%   Well-Formedness Constraint: Parsed Entity
%   An entity reference must not contain the name of an unparsed entity.
%   Unparsed entities may be referred to only in attribute values declared
%   to be of type ENTITY or ENTITIES.
%   
%   Well-Formedness Constraint: No Recursion
%   A parsed entity must not contain a recursive reference to itself,
%   either directly or indirectly.
%   
%   Well-Formedness Constraint: In DTD
%   Parameter-entity references may only appear in the DTD.
%   
%   Examples of character and entity references:
%   
%   Type <key>less-than</key> (&#x3C;) to save options.
%   This document was prepared on &docdate; and
%   is classified &security-level;.
%   
%   Example of a parameter-entity reference:
%   
%   <!-- declare the parameter entity "ISOLat2"... -->
%   <!ENTITY % ISOLat2
%            SYSTEM "http://www.xml.com/iso/isolat2-xml.entities" >
%   <!-- ... now reference it. -->
%   %ISOLat2;
%   
%  4.2 Entity Declarations
%  
%   Entities are declared thus:
%   
%   Entity Declaration
%   [70]  EntityDecl ::= GEDecl | PEDecl

% Production [70]: an entity declaration is either a general entity
% declaration or a parameter entity declaration.  The two sub-parsers are
% combined with the `or' combinator, with geDecl listed first.
:- pred entityDecl(pstate(_)::in, pstate('')::out) is det.

entityDecl -->
    or(geDecl, peDecl).

%   [71]  GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'

% Production [71]: parse '<!ENTITY' S Name S EntityDef S? '>' and record
% the general entity in the table stored under the gEntities global.
% The name and the definition are each parsed through pe/1.
% If the table already contains Name, a warning
% "Multiple declarations of entity `Name'" is issued and the table is left
% unchanged, so the first declaration stays in force; otherwise the new
% definition is added under Name.
:- pred geDecl(pstate(_), pstate('')).
:- mode geDecl(in, out) is det.

geDecl -->
    lit("<!ENTITY")		    then (pred(_::in, pdi, puo) is det -->
    s				    then (pred(_::in, pdi, puo) is det -->
    pe(name)			    then (pred(Name::in, pdi, puo) is det -->
    s				    then (pred(_::in, pdi, puo) is det -->
    pe(entityDef)		    then (pred(Value::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit1('>')			    then (pred(_::in, pdi, puo) is det -->
    get(gEntities, entities(Entities0)),
    ( { contains(Entities0, Name) } ->
        { format("Multiple declarations of entity `%s'", [s(Name)], Msg) },
	warn(Msg),
	{ Entities = Entities0 }
    ;
        { set(Entities0, Name, Value, Entities) }
    ),
    set(gEntities, entities(Entities)),
    return
    ))))))).

%   [72]  PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'

% Production [72]: parse '<!ENTITY' S '%' S Name S PEDef S? '>' and record
% the parameter entity in the table stored under the gPEntities global.
% The name and the definition are each parsed through pe/1.
% If the table already contains Name, a warning
% "Multiple declarations of parameter entity `Name'" is issued and the
% table is left unchanged, so the first declaration stays in force;
% otherwise the new definition is added under Name.
:- pred peDecl(pstate(_), pstate('')).
:- mode peDecl(in, out) is det.

peDecl -->
    lit("<!ENTITY")		    then (pred(_::in, pdi, puo) is det --> 
    s				    then (pred(_::in, pdi, puo) is det --> 
    lit1('%')			    then (pred(_::in, pdi, puo) is det --> 
    s				    then (pred(_::in, pdi, puo) is det --> 
    pe(name)			    then (pred(Name::in, pdi, puo) is det --> 
    s				    then (pred(_::in, pdi, puo) is det --> 
    pe(peDef)			    then (pred(Value::in, pdi, puo) is det --> 
    opt(s)			    then (pred(_::in, pdi, puo) is det --> 
    lit1('>')			    then (pred(_::in, pdi, puo) is det --> 
    get(gPEntities, pentities(PEntities0)),
    ( { contains(PEntities0, Name) } ->
        { format("Multiple declarations of parameter entity `%s'",
		[s(Name)], Msg) },
	warn(Msg),
	{ PEntities = PEntities0 }
    ;
        { set(PEntities0, Name, Value, PEntities) }
    ),
    set(gPEntities, pentities(PEntities)),
    return
    ))))))))).

%   [73]  EntityDef ::= EntityValue | (ExternalID NDataDecl?)

% Production [73]: parse an entity definition.  Two alternatives are
% combined with `or':
%   EntityValue              -> internal(Entity)
%   ExternalID NDataDecl?    -> external(ExtId)
% The result of the optional NDataDecl is discarded, so the notation name
% of an unparsed entity is not carried in the returned value.
:- pred entityDef(pstate(_), pstate(entityDef)).
:- mode entityDef(in, out) is det.

entityDef -->
    % XXX x(entityValue) or (externalID and opt(nDataDecl)).
    (entityValue		    then (pred(Entity::in, pdi, puo) is det -->
        return(internal(Entity))
    )) or (
     externalID			    then (pred(ExtId::in, pdi, puo) is det -->
     opt(nDataDecl)		    then (pred(_::in, pdi, puo) is det -->
	return(external(ExtId))
    ))).

%   [74]  PEDef ::= EntityValue | ExternalID

% Production [74]: parse a parameter entity definition.  This simply
% delegates to entityDef, which means an optional NDataDecl after an
% ExternalID is accepted here too even though [74] does not list one.
:- pred peDef(pstate(_)::in, pstate(entityDef)::out) is det.

peDef(PS0, PS) :-
    % XXX x(entityValue) or externalID.
    entityDef(PS0, PS).

%   
%   The Name identifies the entity in an entity reference or, in the case
%   of an unparsed entity, in the value of an ENTITY or ENTITIES
%   attribute. If the same entity is declared more than once, the first
%   declaration encountered is binding; at user option, an XML processor
%   may issue a warning if entities are declared multiple times.
%   
%    4.2.1 Internal Entities
%    
%   If the entity definition is an EntityValue, the defined entity is
%   called an internal entity. There is no separate physical storage
%   object, and the content of the entity is given in the declaration.
%   Note that some processing of entity and character references in the
%   literal entity value may be required to produce the correct
%   replacement text: see "4.5 Construction of Internal Entity
%   Replacement Text".
%   
%   An internal entity is a parsed entity.
%   
%   Example of an internal entity declaration:
%   
%   <!ENTITY Pub-Status "This is a pre-release of the
%    specification.">
%   
%    4.2.2 External Entities
%    
%   If the entity is not internal, it is an external entity, declared as
%   follows:
%   
%   External Entity Declaration
%   [75]  ExternalID ::= 'SYSTEM' S SystemLiteral
%   | 'PUBLIC' S PubidLiteral S SystemLiteral

% Production [75]: parse an external identifier.  Two alternatives are
% combined with `or':
%   'SYSTEM' S SystemLiteral                 -> system(System)
%   'PUBLIC' S PubidLiteral S SystemLiteral  -> public(Public, System)
% snd drops the keyword and white space in front of each literal, so
% wrapSystem receives one string and wrapPublic receives a pair of strings.
:- pred externalID(pstate(_), pstate(externalId)).
:- mode externalID(in, out) is det.

externalID -->
    wrap(snd((lit("SYSTEM", '') and s) and systemLiteral), wrapSystem) or
    wrap(snd((lit("PUBLIC", '') and s) and pubidLiteral) and
           snd(s and systemLiteral), wrapPublic).

% Wrap a system literal in the `system' constructor of the `externalId'
% type; used by externalID for the SYSTEM alternative.
:- pred wrapSystem(string::in, externalId::out) is det.

wrapSystem(Literal, ExternalId) :-
    ExternalId = system(Literal).

% Turn a (public identifier, system literal) pair into the `public'
% constructor of the `externalId' type; used by externalID for the PUBLIC
% alternative.
:- pred wrapPublic((string, string)::in, externalId::out) is det.

wrapPublic(Pair, ExternalId) :-
    Pair = (PubId, SysId),
    ExternalId = public(PubId, SysId).

%   [76]  NDataDecl ::= S 'NDATA' S Name [ VC: Notation Declared ]

% Production [76]: parse S 'NDATA' S Name and return the notation name.
% The white space and the keyword are discarded.
:- pred nDataDecl(pstate(_), pstate(name)).
:- mode nDataDecl(in, out) is det.

nDataDecl -->
    s				    then (pred(_::in, pdi, puo) is det -->
    lit("NDATA")		    then (pred(_::in, pdi, puo) is det -->
    s				    then (pred(_::in, pdi, puo) is det -->
    name			    then (pred(Name::in, pdi, puo) is det -->
    return(Name)
    )))).


%   
%   If the NDataDecl is present, this is a general unparsed entity;
%   otherwise it is a parsed entity.
%   
%   Validity Constraint: Notation Declared
%   The Name must match the declared name of a notation.
%   
%   The SystemLiteral is called the entity's system identifier. It is a
%   URI, which may be used to retrieve the entity. Note that the hash mark
%   (#) and fragment identifier frequently used with URIs are not,
%   formally, part of the URI itself; an XML processor may signal an error
%   if a fragment identifier is given as part of a system identifier.
%   Unless otherwise provided by information outside the scope of this
%   specification (e.g. a special XML element type defined by a particular
%   DTD, or a processing instruction defined by a particular application
%   specification), relative URIs are relative to the location of the
%   resource within which the entity declaration occurs. A URI might thus
%   be relative to the document entity, to the entity containing the
%   external DTD subset, or to some other external parameter entity.
%   
%   An XML processor should handle a non-ASCII character in a URI by
%   representing the character in UTF-8 as one or more bytes, and then
%   escaping these bytes with the URI escaping mechanism (i.e., by
%   converting each byte to %HH, where HH is the hexadecimal notation of
%   the byte value).
%   
%   In addition to a system identifier, an external identifier may include
%   a public identifier. An XML processor attempting to retrieve the
%   entity's content may use the public identifier to try to generate an
%   alternative URI. If the processor is unable to do so, it must use the
%   URI specified in the system literal. Before a match is attempted, all
%   strings of white space in the public identifier must be normalized to
%   single space characters (#x20), and leading and trailing white space
%   must be removed.
%   
%   Examples of external entity declarations:
%   
%   <!ENTITY open-hatch
%            SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
%   <!ENTITY open-hatch
%            PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//
%   EN"
%            "http://www.textuality.com/boilerplate/OpenHatch.xml">
%   <!ENTITY hatch-pic
%            SYSTEM "../grafix/OpenHatch.gif"
%            NDATA gif >
%   
%  4.3 Parsed Entities
%  
%    4.3.1 The Text Declaration
%    
%   External parsed entities may each begin with a text declaration.
%   
%   Text Declaration
%   [77]  TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'

% Production [77]: parse '<?xml' VersionInfo? EncodingDecl S? '?>' at the
% start of an external parsed entity and switch to the declared encoding.
% The encoding name returned by encodingDecl is looked up in the table
% stored under the gEncodings global.  If it is found, setEncoding is
% called with the matching encoding; otherwise error/1 is called with
% "unknown encoding `Name' in external entity".
% NOTE(review): the lookup uses the name exactly as written in the
% document; no case folding is done in this predicate.
:- pred textDecl(pstate(_), pstate('')).
:- mode textDecl(in, out) is det.

textDecl -->
    lit("<?xml")		    then (pred(_::in, pdi, puo) is det -->
    opt(versionInfo)		    then (pred(_::in, pdi, puo) is det -->
    encodingDecl		    then (pred(EncName::in, pdi, puo) is det -->
    opt(s)			    then (pred(_::in, pdi, puo) is det -->
    lit("?>")			    then (pred(_::in, pdi, puo) is det -->
    get(gEncodings, encodings(Encodings)),
    ( { search(Encodings, EncName, Enc) } ->
        setEncoding(Enc),
	return
    ;
    	{ format("unknown encoding `%s' in external entity",
		[s(EncName)], Msg) },
	error(Msg)
    )))))).

%   
%   The text declaration must be provided literally, not by reference to a
%   parsed entity. No text declaration may appear at any position other
%   than the beginning of an external parsed entity.
%   
%    4.3.2 Well-Formed Parsed Entities
%    
%   The document entity is well-formed if it matches the production
%   labeled document. An external general parsed entity is well-formed if
%   it matches the production labeled extParsedEnt. An external parameter
%   entity is well-formed if it matches the production labeled extPE.
%   
%   Well-Formed External Parsed Entity
%   [78]  extParsedEnt ::= TextDecl? content

% Production [78]: an external parsed entity is an optional text
% declaration followed by content.  snd discards the result of the text
% declaration so that only the parsed content is returned.
:- pred extParsedEnt(pstate(_)::in,
	pstate(list(ref(doc:content)))::out) is det.

extParsedEnt -->
    snd(and(opt(textDecl), content)).

%   [79]  extPE ::= TextDecl? extSubsetDecl

% Production [79]: an external parameter entity is an optional text
% declaration followed by extSubsetDecl.  Both results are discarded.
:- pred extPE(pstate(_), pstate('')).
:- mode extPE(in, out) is det.

extPE -->
    opt(textDecl)		    then (pred(_::in, pdi, puo) is det -->
    extSubsetDecl		    then (pred(_::in, pdi, puo) is det -->
    return
    )).

%   
%   An internal general parsed entity is well-formed if its replacement
%   text matches the production labeled content. All internal parameter
%   entities are well-formed by definition.
%   
%   A consequence of well-formedness in entities is that the logical and
%   physical structures in an XML document are properly nested; no
%   start-tag, end-tag, empty-element tag, element, comment, processing
%   instruction, character reference, or entity reference can begin in one
%   entity and end in another.
%   
%    4.3.3 Character Encoding in Entities
%    
%   Each external parsed entity in an XML document may use a different
%   encoding for its characters. All XML processors must be able to read
%   entities in either UTF-8 or UTF-16.
%   
%   Entities encoded in UTF-16 must begin with the Byte Order Mark
%   described by ISO/IEC 10646 Annex E and Unicode Appendix B (the ZERO
%   WIDTH NO-BREAK SPACE character, #xFEFF). This is an encoding
%   signature, not part of either the markup or the character data of the
%   XML document. XML processors must be able to use this character to
%   differentiate between UTF-8 and UTF-16 encoded documents.
%   
%   Although an XML processor is required to read only entities in the
%   UTF-8 and UTF-16 encodings, it is recognized that other encodings are
%   used around the world, and it may be desired for XML processors to
%   read entities that use them. Parsed entities which are stored in an
%   encoding other than UTF-8 or UTF-16 must begin with a text declaration
%   containing an encoding declaration:
%   
%   Encoding Declaration
%   [80]  EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' |  "'" EncName
%   "'" )

% Production [80]: parse S 'encoding' Eq quote EncName quote and return
% the encoding name.  The opening and closing quote values returned by
% `quote' are compared; if they differ, error/1 is called with
% "mismatched quotes".
:- pred encodingDecl(pstate(_), pstate(string)).
:- mode encodingDecl(in, out) is det.

encodingDecl -->
    s				    then (pred(_::in, pdi, puo) is det -->
    lit("encoding")		    then (pred(_::in, pdi, puo) is det -->
    eq				    then (pred(_::in, pdi, puo) is det -->
    quote			    then (pred(Q::in, pdi, puo) is det -->
    encName			    then (pred(Name::in, pdi, puo) is det -->
    quote			    then (pred(EndQ::in, pdi, puo) is det -->
    ( { Q = EndQ } ->
    	return(Name)
    ;
    	error("mismatched quotes")
    ))))))).

%   [81]  EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* /* Encoding name
%   contains only Latin characters */

% Production [81]: parse an encoding name.  The first character must be
% in 'A'..'Z' or 'a'..'z'; it is followed by zero or more characters
% drawn from letters, digits, '.', '_' and '-'.  The collected characters
% are turned into a string with mkString and that string is returned.
:- pred encName(pstate(_), pstate(string)).
:- mode encName(in, out) is det.

encName -->
    (range('A', 'Z') or range('a', 'z'))
    				    then (pred(C::in, pdi, puo) is det --> 
    star(range('A', 'Z') or range('a', 'z') or range('0', '9') or
         lit1('.') or lit1('_') or lit1('-'))
				    then (pred(Cs::in, pdi, puo) is det --> 
    mkString([C|Cs], EncName),
    return(EncName)
    )).

%   
%   In the document entity, the encoding declaration is part of the XML
%   declaration. The EncName is the name of the encoding used.
%   
%   In an encoding declaration, the values "UTF-8", "UTF-16",
%   "ISO-10646-UCS-2", and "ISO-10646-UCS-4" should be used for the
%   various encodings and transformations of Unicode / ISO/IEC 10646, the
%   values "ISO-8859-1", "ISO-8859-2", ... "ISO-8859-9" should be used for
%   the parts of ISO 8859, and the values "ISO-2022-JP", "Shift_JIS", and
%   "EUC-JP" should be used for the various encoded forms of JIS
%   X-0208-1997. XML processors may recognize other encodings; it is
%   recommended that character encodings registered (as charsets) with the
%   Internet Assigned Numbers Authority [IANA], other than those just
%   listed, should be referred to using their registered names. Note that
%   these registered names are defined to be case-insensitive, so
%   processors wishing to match against them should do so in a
%   case-insensitive way.
%   
%   In the absence of information provided by an external transport
%   protocol (e.g. HTTP or MIME), it is an error for an entity including
%   an encoding declaration to be presented to the XML processor in an
%   encoding other than that named in the declaration, for an encoding
%   declaration to occur other than at the beginning of an external
%   entity, or for an entity which begins with neither a Byte Order Mark
%   nor an encoding declaration to use an encoding other than UTF-8. Note
%   that since ASCII is a subset of UTF-8, ordinary ASCII entities do not
%   strictly need an encoding declaration.
%   
%   It is a fatal error when an XML processor encounters an entity with an
%   encoding that it is unable to process.
%   
%   Examples of encoding declarations:
%   
%   <?xml encoding='UTF-8'?>
%   <?xml encoding='EUC-JP'?>
%   
%  4.4 XML Processor Treatment of Entities and References
%  
%   The table below summarizes the contexts in which character references,
%   entity references, and invocations of unparsed entities might appear
%   and the required behavior of an XML processor in each case. The labels
%   in the leftmost column describe the recognition context:
%   
%   Reference in Content
%          as a reference anywhere after the start-tag and before the
%          end-tag of an element; corresponds to the nonterminal content.
%          
%   Reference in Attribute Value
%          as a reference within either the value of an attribute in a
%          start-tag, or a default value in an attribute declaration;
%          corresponds to the nonterminal AttValue.
%          
%   Occurs as Attribute Value
%          as a Name, not a reference, appearing either as the value of an
%          attribute which has been declared as type ENTITY, or as one of
%          the space-separated tokens in the value of an attribute which
%          has been declared as type ENTITIES.
%          
%   Reference in Entity Value
%          as a reference within a parameter or internal entity's literal
%          entity value in the entity's declaration; corresponds to the
%          nonterminal EntityValue.
%          
%   Reference in DTD
%          as a reference within either the internal or external subsets
%          of the DTD, but outside of an EntityValue or AttValue.
%          
%                                       Entity Type
%
%                   Parameter   Internal    External    Unparsed    Character
%                               General     Parsed
%                                           General
%   -------------------------------------------------------------------------
%   Reference in    Not         Included    Included if Forbidden   Included
%   Content         recognized              validating
%
%   Reference in    Not         Included in Forbidden   Forbidden   Included
%   Attribute Value recognized  literal
%
%   Occurs as       Not         Forbidden   Forbidden   Notify      Not
%   Attribute Value recognized                                      recognized
%
%   Reference in    Included in Bypassed    Bypassed    Forbidden   Included
%   EntityValue     literal
%
%   Reference in    Included    Forbidden   Forbidden   Forbidden   Forbidden
%   DTD             as PE
%                                      
%    4.4.1 Not Recognized
%    
%   Outside the DTD, the % character has no special significance; thus,
%   what would be parameter entity references in the DTD are not
%   recognized as markup in content. Similarly, the names of unparsed
%   entities are not recognized except when they appear in the value of an
%   appropriately declared attribute.
%   
%    4.4.2 Included
%    
%   An entity is included when its replacement text is retrieved and
%   processed, in place of the reference itself, as though it were part of
%   the document at the location the reference was recognized. The
%   replacement text may contain both character data and (except for
%   parameter entities) markup, which must be recognized in the usual way,
%   except that the replacement text of entities used to escape markup
%   delimiters (the entities amp, lt, gt, apos, quot) is always treated as
%   data. (The string "AT&amp;T;" expands to "AT&T;" and the remaining
%   ampersand is not recognized as an entity-reference delimiter.) A
%   character reference is included when the indicated character is
%   processed in place of the reference itself.
%   
%    4.4.3 Included If Validating
%    
%   When an XML processor recognizes a reference to a parsed entity, in
%   order to validate the document, the processor must include its
%   replacement text. If the entity is external, and the processor is not
%   attempting to validate the XML document, the processor may, but need
%   not, include the entity's replacement text. If a non-validating parser
%   does not include the replacement text, it must inform the application
%   that it recognized, but did not read, the entity.
%   
%   This rule is based on the recognition that the automatic inclusion
%   provided by the SGML and XML entity mechanism, primarily designed to
%   support modularity in authoring, is not necessarily appropriate for
%   other applications, in particular document browsing. Browsers, for
%   example, when encountering an external parsed entity reference, might
%   choose to provide a visual indication of the entity's presence and
%   retrieve it for display only on demand.
%   
%    4.4.4 Forbidden
%    
%   The following are forbidden, and constitute fatal errors:
%     * the appearance of a reference to an unparsed entity.
%     * the appearance of any character or general-entity reference in the
%       DTD except within an EntityValue or AttValue.
%     * a reference to an external entity in an attribute value.
%       
%    4.4.5 Included in Literal
%    
%   When an entity reference appears in an attribute value, or a parameter
%   entity reference appears in a literal entity value, its replacement
%   text is processed in place of the reference itself as though it were
%   part of the document at the location the reference was recognized,
%   except that a single or double quote character in the replacement text
%   is always treated as a normal data character and will not terminate
%   the literal. For example, this is well-formed:
%   
%   <!ENTITY % YN '"Yes"' >
%   <!ENTITY WhatHeSaid "He said &YN;" >
%   
%   while this is not:
%   
%   <!ENTITY EndAttr "27'" >
%   <element attribute='a-&EndAttr;>
%   
%    4.4.6 Notify
%    
%   When the name of an unparsed entity appears as a token in the value of
%   an attribute of declared type ENTITY or ENTITIES, a validating
%   processor must inform the application of the system and public (if
%   any) identifiers for both the entity and its associated notation.
%   
%    4.4.7 Bypassed
%    
%   When a general entity reference appears in the EntityValue in an
%   entity declaration, it is bypassed and left as is.
%   
%    4.4.8 Included as PE
%    
%   Just as with external parsed entities, parameter entities need only be
%   included if validating. When a parameter-entity reference is
%   recognized in the DTD and included, its replacement text is enlarged
%   by the attachment of one leading and one following space (#x20)
%   character; the intent is to constrain the replacement text of
%   parameter entities to contain an integral number of grammatical tokens
%   in the DTD.
%   
%  4.5 Construction of Internal Entity Replacement Text
%  
%   In discussing the treatment of internal entities, it is useful to
%   distinguish two forms of the entity's value. The literal entity value
%   is the quoted string actually present in the entity declaration,
%   corresponding to the non-terminal EntityValue. The replacement text is
%   the content of the entity, after replacement of character references
%   and parameter-entity references.
%   
%   The literal entity value as given in an internal entity declaration
%   (EntityValue) may contain character, parameter-entity, and
%   general-entity references. Such references must be contained entirely
%   within the literal entity value. The actual replacement text that is
%   included as described above must contain the replacement text of any
%   parameter entities referred to, and must contain the character
%   referred to, in place of any character references in the literal
%   entity value; however, general-entity references must be left as-is,
%   unexpanded. For example, given the following declarations:
%   
%   <!ENTITY % pub    "&#xc9;ditions Gallimard" >
%   <!ENTITY   rights "All rights reserved" >
%   <!ENTITY   book   "La Peste: Albert Camus,
%   &#xA9; 1947 %pub;. &rights;" >
%   
%   then the replacement text for the entity "book" is:
%   
%   La Peste: Albert Camus,
%   © 1947 Éditions Gallimard. &rights;
%   
%   The general-entity reference "&rights;" would be expanded should the
%   reference "&book;" appear in the document's content or an attribute
%   value.
%   
%   These simple rules may have complex interactions; for a detailed
%   discussion of a difficult example, see "D. Expansion of Entity and
%   Character References".
%   
%  4.6 Predefined Entities
%  
%   Entity and character references can both be used to escape the left
%   angle bracket, ampersand, and other delimiters. A set of general
%   entities (amp, lt, gt, apos, quot) is specified for this purpose.
%   Numeric character references may also be used; they are expanded
%   immediately when recognized and must be treated as character data, so
%   the numeric character references "&#60;" and "&#38;" may be used to
%   escape < and & when they occur in character data.
%   
%   All XML processors must recognize these entities whether they are
%   declared or not. For interoperability, valid XML documents should
%   declare these entities, like any others, before using them. If the
%   entities in question are declared, they must be declared as internal
%   entities whose replacement text is the single character being escaped
%   or a character reference to that character, as shown below.
%   
%   <!ENTITY lt     "&#38;#60;">
%   <!ENTITY gt     "&#62;">
%   <!ENTITY amp    "&#38;#38;">
%   <!ENTITY apos   "&#39;">
%   <!ENTITY quot   "&#34;">
%   
%   Note that the < and & characters in the declarations of "lt" and "amp"
%   are doubly escaped to meet the requirement that entity replacement be
%   well-formed.
%   
%  4.7 Notation Declarations
%  
%   Notations identify by name the format of unparsed entities, the format
%   of elements which bear a notation attribute, or the application to
%   which a processing instruction is addressed.
%   
%   Notation declarations provide a name for the notation, for use in
%   entity and attribute-list declarations and in attribute
%   specifications, and an external identifier for the notation which may
%   allow an XML processor or its client application to locate a helper
%   application capable of processing data in the given notation.
%   
%   Notation Declarations
%   [82]  NotationDecl ::= '<!NOTATION' S Name S (ExternalID |  PublicID)
%   S? '>'

% Parser for a notation declaration, production [82]:
%
%	'<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
%
% Each step's result is bound to `_' in the continuation that follows it,
% so neither the notation name nor its identifier is carried forward; the
% chain ends with a bare `return'.
% NOTE(review): what `return' yields for pstate('') is defined outside
% this block -- confirm against the parsing module.
:- pred notationDecl(pstate(_), pstate('')).
:- mode notationDecl(in, out) is det.

notationDecl -->
	% the opening keyword
    lit("<!NOTATION") then (pred(_::in, pdi, puo) is det -->
	% mandatory white space
    s then (pred(_::in, pdi, puo) is det -->
	% the notation's name (discarded)
    name then (pred(_::in, pdi, puo) is det -->
	% mandatory white space
    s then (pred(_::in, pdi, puo) is det -->
	% either form of identifier (discarded)
    (externalID or publicID) then (pred(_::in, pdi, puo) is det -->
	% optional white space before the closing delimiter
    opt(s) then (pred(_::in, pdi, puo) is det -->
	% the closing '>'
    lit1('>') then (pred(_::in, pdi, puo) is det -->
    return
    ))))))).

%   [83]  PublicID ::= 'PUBLIC' S PubidLiteral

% Parser for a public identifier, production [83]:
%
%	'PUBLIC' S PubidLiteral
%
% The keyword and the white space are discarded; the literal is returned
% wrapped as public(PubidLit, ""). The second argument is always the empty
% string here -- presumably the slot for a system literal, which this
% production does not have (confirm against the externalId type).
:- pred publicID(pstate(_), pstate(externalId)).
:- mode publicID(in, out) is det.

publicID -->
    lit("PUBLIC") then (pred(_::in, pdi, puo) is det -->
    s then (pred(_::in, pdi, puo) is det -->
    pubidLiteral then (pred(PubidLit::in, pdi, puo) is det -->
    return(public(PubidLit, ""))
    ))).

%   
%   XML processors must provide applications with the name and external
%   identifier(s) of any notation declared and referred to in an attribute
%   value, attribute definition, or entity declaration. They may
%   additionally resolve the external identifier into the system
%   identifier, file name, or other information needed to allow the
%   application to call a processor for data in the notation described.
%   (It is not an error, however, for XML documents to declare and refer
%   to notations for which notation-specific applications are not
%   available on the system where the XML processor or application is
%   running.)
%   
%  4.8 Document Entity
%  
%   The document entity serves as the root of the entity tree and a
%   starting-point for an XML processor. This specification does not
%   specify how the document entity is to be located by an XML processor;
%   unlike other entities, the document entity has no name and might well
%   appear on a processor input stream without any identification at all.
%   
%5. Conformance
%
%  5.1 Validating and Non-Validating Processors
%  
%   Conforming XML processors fall into two classes: validating and
%   non-validating.
%   
%   Validating and non-validating processors alike must report violations
%   of this specification's well-formedness constraints in the content of
%   the document entity and any other parsed entities that they read.
%   
%   Validating processors must report violations of the constraints
%   expressed by the declarations in the DTD, and failures to fulfill the
%   validity constraints given in this specification. To accomplish this,
%   validating XML processors must read and process the entire DTD and all
%   external parsed entities referenced in the document.
%   
%   Non-validating processors are required to check only the document
%   entity, including the entire internal DTD subset, for well-formedness.
%   While they are not required to check the document for validity, they
%   are required to process all the declarations they read in the internal
%   DTD subset and in any parameter entity that they read, up to the first
%   reference to a parameter entity that they do not read; that is to say,
%   they must use the information in those declarations to normalize
%   attribute values, include the replacement text of internal entities,
%   and supply default attribute values. They must not process entity
%   declarations or attribute-list declarations encountered after a
%   reference to a parameter entity that is not read, since the entity may
%   have contained overriding declarations.
%   
%  5.2 Using XML Processors
%  
%   The behavior of a validating XML processor is highly predictable; it
%   must read every piece of a document and report all well-formedness and
%   validity violations. Less is required of a non-validating processor;
%   it need not read any part of the document other than the document
%   entity. This has two effects that may be important to users of XML
%   processors:
%     * Certain well-formedness errors, specifically those that require
%       reading external entities, may not be detected by a non-validating
%       processor. Examples include the constraints entitled Entity
%       Declared, Parsed Entity, and No Recursion, as well as some of the
%       cases described as forbidden in "4.4 XML Processor Treatment of
%       Entities and References".
%     * The information passed from the processor to the application may
%       vary, depending on whether the processor reads parameter and
%       external entities. For example, a non-validating processor may not
%       normalize attribute values, include the replacement text of
%       internal entities, or supply default attribute values, where doing
%       so depends on having read declarations in external or parameter
%       entities.
%       
%   For maximum reliability in interoperating between different XML
%   processors, applications which use non-validating processors should
%   not rely on any behaviors not required of such processors.
%   Applications which require facilities such as the use of default
%   attributes or internal entities which are declared in external
%   entities should use validating XML processors.
%   
%6. Notation
%
%   The formal grammar of XML is given in this specification using a
%   simple Extended Backus-Naur Form (EBNF) notation. Each rule in the
%   grammar defines one symbol, in the form
%   
%   symbol ::= expression
%   
%   Symbols are written with an initial capital letter if they are defined
%   by a regular expression, or with an initial lower case letter
%   otherwise. Literal strings are quoted.
%   
%   Within the expression on the right-hand side of a rule, the following
%   expressions are used to match strings of one or more characters:
%   
%   #xN
%          where N is a hexadecimal integer, the expression matches the
%          character in ISO/IEC 10646 whose canonical (UCS-4) code value,
%          when interpreted as an unsigned binary number, has the value
%          indicated. The number of leading zeros in the #xN form is
%          insignificant; the number of leading zeros in the corresponding
%          code value is governed by the character encoding in use and is
%          not significant for XML.
%          
%   [a-zA-Z], [#xN-#xN]
%          matches any character with a value in the range(s) indicated
%          (inclusive).
%          
%   [^a-z], [^#xN-#xN]
%          matches any character with a value outside the range indicated.
%          
%   [^abc], [^#xN#xN#xN]
%          matches any character with a value not among the characters
%          given.
%          
%   "string"
%          matches a literal string matching that given inside the double
%          quotes.
%          
%   'string'
%          matches a literal string matching that given inside the single
%          quotes.
%          
%   These symbols may be combined to match more complex patterns as
%   follows, where A and B represent simple expressions:
%   
%   (expression)
%          expression is treated as a unit and may be combined as
%          described in this list.
%          
%   A?
%          matches A or nothing; optional A.
%          
%   A B
%          matches A followed by B.
%          
%   A | B
%          matches A or B but not both.
%          
%   A - B
%          matches any string that matches A but does not match B.
%          
%   A+
%          matches one or more occurrences of A.
%          
%   A*
%          matches zero or more occurrences of A.
%          
%   Other notations used in the productions are:
%   
%   /* ... */
%          comment.
%          
%   [ wfc: ... ]
%          well-formedness constraint; this identifies by name a
%          constraint on well-formed documents associated with a
%          production.
%          
%   [ vc: ... ]
%          validity constraint; this identifies by name a constraint on
%          valid documents associated with a production.
%     _________________________________________________________________
%   
%                                  Appendices
%                                       
%A. References
%
%  A.1 Normative References
%  
%   IANA
%          (Internet Assigned Numbers Authority) Official Names for
%          Character Sets, ed. Keld Simonsen et al. See
%          ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets.
%          
%   IETF RFC 1766
%          IETF (Internet Engineering Task Force). RFC 1766: Tags for the
%          Identification of Languages, ed. H. Alvestrand. 1995.
%          
%   ISO 639
%          (International Organization for Standardization). ISO 639:1988
%          (E). Code for the representation of names of languages.
%          [Geneva]: International Organization for Standardization, 1988.
%          
%   ISO 3166
%          (International Organization for Standardization). ISO
%          3166-1:1997 (E). Codes for the representation of names of
%          countries and their subdivisions -- Part 1: Country codes
%          [Geneva]: International Organization for Standardization, 1997.
%          
%   ISO/IEC 10646
%          ISO (International Organization for Standardization). ISO/IEC
%          10646-1993 (E). Information technology -- Universal
%          Multiple-Octet Coded Character Set (UCS) -- Part 1:
%          Architecture and Basic Multilingual Plane. [Geneva]:
%          International Organization for Standardization, 1993 (plus
%          amendments AM 1 through AM 7).
%          
%   Unicode
%          The Unicode Consortium. The Unicode Standard, Version 2.0.
%          Reading, Mass.: Addison-Wesley Developers Press, 1996.
%          
%  A.2 Other References
%  
%   Aho/Ullman
%          Aho, Alfred V., Ravi Sethi, and Jeffrey D. Ullman. Compilers:
%          Principles, Techniques, and Tools. Reading: Addison-Wesley,
%          1986, rpt. corr. 1988.
%          
%   Berners-Lee et al.
%          Berners-Lee, T., R. Fielding, and L. Masinter. Uniform Resource
%          Identifiers (URI): Generic Syntax and Semantics. 1997. (Work in
%          progress; see updates to RFC1738.)
%          
%   Brüggemann-Klein
%          Brüggemann-Klein, Anne. Regular Expressions into Finite
%          Automata. Extended abstract in I. Simon, Hrsg., LATIN 1992, S.
%          97-98. Springer-Verlag, Berlin 1992. Full Version in
%          Theoretical Computer Science 120: 197-213, 1993.
%          
%   Brüggemann-Klein and Wood
%          Brüggemann-Klein, Anne, and Derick Wood. Deterministic Regular
%          Languages. Universität Freiburg, Institut für Informatik,
%          Bericht 38, Oktober 1991.
%          
%   Clark
%          James Clark. Comparison of SGML and XML. See
%          http://www.w3.org/TR/NOTE-sgml-xml-971215.
%          
%   IETF RFC1738
%          IETF (Internet Engineering Task Force). RFC 1738: Uniform
%          Resource Locators (URL), ed. T. Berners-Lee, L. Masinter, M.
%          McCahill. 1994.
%          
%   IETF RFC1808
%          IETF (Internet Engineering Task Force). RFC 1808: Relative
%          Uniform Resource Locators, ed. R. Fielding. 1995.
%          
%   IETF RFC2141
%          IETF (Internet Engineering Task Force). RFC 2141: URN Syntax,
%          ed. R. Moats. 1997.
%          
%   ISO 8879
%          ISO (International Organization for Standardization). ISO
%          8879:1986(E). Information processing -- Text and Office Systems
%          -- Standard Generalized Markup Language (SGML). First edition
%          -- 1986-10-15. [Geneva]: International Organization for
%          Standardization, 1986.
%          
%   ISO/IEC 10744
%          ISO (International Organization for Standardization). ISO/IEC
%          10744-1992 (E). Information technology -- Hypermedia/Time-based
%          Structuring Language (HyTime). [Geneva]: International
%          Organization for Standardization, 1992. Extended Facilities
%          Annexe. [Geneva]: International Organization for
%          Standardization, 1996.
%          
%B. Character Classes
%

	% The character classes are in a separate module
	% for compile-time performance!
:- include_module xml:parse:chars.
:- import_module xml:parse:chars.

%C. XML and SGML (Non-Normative)
%
%   XML is designed to be a subset of SGML, in that every valid XML
%   document should also be a conformant SGML document. For a detailed
%   comparison of the additional restrictions that XML places on documents
%   beyond those of SGML, see [Clark].
%   
%D. Expansion of Entity and Character References (Non-Normative)
%
%   This appendix contains some examples illustrating the sequence of
%   entity- and character-reference recognition and expansion, as
%   specified in "4.4 XML Processor Treatment of Entities and References".
%   
%   If the DTD contains the declaration
%   
%   <!ENTITY example "<p>An ampersand (&#38;#38;) may be escaped
%   numerically (&#38;#38;#38;) or with a general entity
%   (&amp;amp;).</p>" >
%   
%   then the XML processor will recognize the character references when it
%   parses the entity declaration, and resolve them before storing the
%   following string as the value of the entity "example":
%   
%   <p>An ampersand (&#38;) may be escaped
%   numerically (&#38;#38;) or with a general entity
%   (&amp;amp;).</p>
%   
%   A reference in the document to "&example;" will cause the text to be
%   reparsed, at which time the start- and end-tags of the "p" element
%   will be recognized and the three references will be recognized and
%   expanded, resulting in a "p" element with the following content (all
%   data, no delimiters or markup):
%   
%   An ampersand (&) may be escaped
%   numerically (&#38;) or with a general entity
%   (&amp;).
%   
%   A more complex example will illustrate the rules and their effects
%   fully. In the following example, the line numbers are solely for
%   reference.
%   
%   1 <?xml version='1.0'?>
%   2 <!DOCTYPE test [
%   3 <!ELEMENT test (#PCDATA) >
%   4 <!ENTITY % xx '&#37;zz;'>
%   5 <!ENTITY % zz '&#60;!ENTITY tricky "error-prone" >' >
%   6 %xx;
%   7 ]>
%   8 <test>This sample shows a &tricky; method.</test>
%   
%   This produces the following:
%     * in line 4, the reference to character 37 is expanded immediately,
%       and the parameter entity "xx" is stored in the symbol table with
%       the value "%zz;". Since the replacement text is not rescanned, the
%       reference to parameter entity "zz" is not recognized. (And it
%       would be an error if it were, since "zz" is not yet declared.)
%     * in line 5, the character reference "&#60;" is expanded immediately
%       and the parameter entity "zz" is stored with the replacement text
%       "<!ENTITY tricky "error-prone" >", which is a well-formed entity
%       declaration.
%     * in line 6, the reference to "xx" is recognized, and the
%       replacement text of "xx" (namely "%zz;") is parsed. The reference
%       to "zz" is recognized in its turn, and its replacement text
%       ("<!ENTITY tricky "error-prone" >") is parsed. The general entity
%       "tricky" has now been declared, with the replacement text
%       "error-prone".
%     * in line 8, the reference to the general entity "tricky" is
%       recognized, and it is expanded, so the full content of the "test"
%       element is the self-describing (and ungrammatical) string This
%       sample shows a error-prone method.
%       
%E. Deterministic Content Models (Non-Normative)
%
%   For compatibility, it is required that content models in element type
%   declarations be deterministic.
%   
%   SGML requires deterministic content models (it calls them
%   "unambiguous"); XML processors built using SGML systems may flag
%   non-deterministic content models as errors.
%   
%   For example, the content model ((b, c) | (b, d)) is non-deterministic,
%   because given an initial b the parser cannot know which b in the model
%   is being matched without looking ahead to see which element follows
%   the b. In this case, the two references to b can be collapsed into a
%   single reference, making the model read (b, (c | d)). An initial b now
%   clearly matches only a single name in the content model. The parser
%   doesn't need to look ahead to see what follows; either c or d would be
%   accepted.
%   
%   More formally: a finite state automaton may be constructed from the
%   content model using the standard algorithms, e.g. algorithm 3.5 in
%   section 3.9 of Aho, Sethi, and Ullman [Aho/Ullman]. In many such
%   algorithms, a follow set is constructed for each position in the
%   regular expression (i.e., each leaf node in the syntax tree for the
%   regular expression); if any position has a follow set in which more
%   than one following position is labeled with the same element type
%   name, then the content model is in error and may be reported as an
%   error.
%   
%   Algorithms exist which allow many but not all non-deterministic
%   content models to be reduced automatically to equivalent deterministic
%   models; see Brüggemann-Klein 1991 [Brüggemann-Klein].
%   
%F. Autodetection of Character Encodings (Non-Normative)
%
%   The XML encoding declaration functions as an internal label on each
%   entity, indicating which character encoding is in use. Before an XML
%   processor can read the internal label, however, it apparently has to
%   know what character encoding is in use--which is what the internal
%   label is trying to indicate. In the general case, this is a hopeless
%   situation. It is not entirely hopeless in XML, however, because XML
%   limits the general case in two ways: each implementation is assumed to
%   support only a finite set of character encodings, and the XML encoding
%   declaration is restricted in position and content in order to make it
%   feasible to autodetect the character encoding in use in each entity in
%   normal cases. Also, in many cases other sources of information are
%   available in addition to the XML data stream itself. Two cases may be
%   distinguished, depending on whether the XML entity is presented to the
%   processor without, or with, any accompanying (external) information.
%   We consider the first case first.
%   
%   Because each XML entity not in UTF-8 or UTF-16 format must begin with
%   an XML encoding declaration, in which the first characters must be
%   '<?xml', any conforming processor can detect, after two to four octets
%   of input, which of the following cases apply. In reading this list, it
%   may help to know that in UCS-4, '<' is "#x0000003C" and '?' is
%   "#x0000003F", and the Byte Order Mark required of UTF-16 data streams
%   is "#xFEFF".
%   
%     * 00 00 00 3C: UCS-4, big-endian machine (1234 order)
%     * 3C 00 00 00: UCS-4, little-endian machine (4321 order)
%     * 00 00 3C 00: UCS-4, unusual octet order (2143)
%     * 00 3C 00 00: UCS-4, unusual octet order (3412)
%     * FE FF: UTF-16, big-endian
%     * FF FE: UTF-16, little-endian
%     * 00 3C 00 3F: UTF-16, big-endian, no Byte Order Mark (and thus,
%       strictly speaking, in error)
%     * 3C 00 3F 00: UTF-16, little-endian, no Byte Order Mark (and thus,
%       strictly speaking, in error)
%     * 3C 3F 78 6D: UTF-8, ISO 646, ASCII, some part of ISO 8859,
%       Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding
%       which ensures that the characters of ASCII have their normal
%       positions, width, and values; the actual encoding declaration must
%       be read to detect which of these applies, but since all of these
%       encodings use the same bit patterns for the ASCII characters, the
%       encoding declaration itself may be read reliably
%     * 4C 6F A7 94: EBCDIC (in some flavor; the full encoding declaration
%       must be read to tell which code page is in use)
%     * other: UTF-8 without an encoding declaration, or else the data
%       stream is corrupt, fragmentary, or enclosed in a wrapper of some
%       kind
%       
%   This level of autodetection is enough to read the XML encoding
%   declaration and parse the character-encoding identifier, which is
%   still necessary to distinguish the individual members of each family
%   of encodings (e.g. to tell UTF-8 from 8859, and the parts of 8859 from
%   each other, or to distinguish the specific EBCDIC code page in use,
%   and so on).
%   
%   Because the contents of the encoding declaration are restricted to
%   ASCII characters, a processor can reliably read the entire encoding
%   declaration as soon as it has detected which family of encodings is in
%   use. Since in practice, all widely used character encodings fall into
%   one of the categories above, the XML encoding declaration allows
%   reasonably reliable in-band labeling of character encodings, even when
%   external sources of information at the operating-system or
%   transport-protocol level are unreliable.
%   
%   Once the processor has detected the character encoding in use, it can
%   act appropriately, whether by invoking a separate input routine for
%   each case, or by calling the proper conversion function on each
%   character of input.
%   
%   Like any self-labeling system, the XML encoding declaration will not
%   work if any software changes the entity's character set or encoding
%   without updating the encoding declaration. Implementors of
%   character-encoding routines should be careful to ensure the accuracy
%   of the internal and external information used to label the entity.
%   
%   The second possible case occurs when the XML entity is accompanied by
%   encoding information, as in some file systems and some network
%   protocols. When multiple sources of information are available, their
%   relative priority and the preferred method of handling conflict should
%   be specified as part of the higher-level protocol used to deliver XML.
%   Rules for the relative priority of the internal label and the
%   MIME-type label in an external header, for example, should be part of
%   the RFC document defining the text/xml and application/xml MIME types.
%   In the interests of interoperability, however, the following rules are
%   recommended.
%     * If an XML entity is in a file, the Byte-Order Mark and
%       encoding-declaration PI are used (if present) to determine the
%       character encoding. All other heuristics and sources of
%       information are solely for error recovery.
%     * If an XML entity is delivered with a MIME type of text/xml, then
%       the charset parameter on the MIME type determines the character
%       encoding method; all other heuristics and sources of information
%       are solely for error recovery.
%     * If an XML entity is delivered with a MIME type of application/xml,
%       then the Byte-Order Mark and encoding-declaration PI are used (if
%       present) to determine the character encoding. All other heuristics
%       and sources of information are solely for error recovery.
%       
%   These rules apply only in the absence of protocol-level documentation;
%   in particular, when the MIME types text/xml and application/xml are
%   defined, the recommendations of the relevant RFC will supersede these
%   rules.
%   
%G. W3C XML Working Group (Non-Normative)
%
%   This specification was prepared and approved for publication by the
%   W3C XML Working Group (WG). WG approval of this specification does not
%   necessarily imply that all WG members voted for its approval. The
%   current and former members of the XML WG are:
%   Jon Bosak, Sun (Chair); James Clark (Technical Lead); Tim Bray,
%   Textuality and Netscape (XML Co-editor); Jean Paoli, Microsoft (XML
%   Co-editor); C. M. Sperberg-McQueen, U. of Ill. (XML Co-editor); Dan
%   Connolly, W3C (W3C Liaison); Paula Angerstein, Texcel; Steve DeRose,
%   INSO; Dave Hollander, HP; Eliot Kimber, ISOGEN; Eve Maler, ArborText;
%   Tom Magliery, NCSA; Murray Maloney, Muzmo and Grif; Makoto Murata,
%   Fuji Xerox Information Systems; Joel Nava, Adobe; Conleth O'Connell,
%   Vignette; Peter Sharpe, SoftQuad; John Tigue, DataChannel
%   
%   Copyright © 1998 W3C (MIT, INRIA, Keio), All Rights Reserved. W3C
%   liability, trademark, document use and software licensing rules
%   apply.

% add(Cont, Ref, PS0, PS):
%   Fetch the value stored under gContent in the parser state, hand it
%   to doc:add together with Cont (which yields Ref and an updated
%   value), and store the updated value back under gContent.
:- pred add(doc:content, ref(doc:content), pstate(T), pstate(T)).
:- mode add(in, out, pdi, puo) is det.

add(Cont, Ref, PS0, PS) :-
    get(gContent, Store0, PS0, PS1),
    doc:add(Cont, Ref, Store0, Store),
    set(gContent, Store, PS1, PS).

%:- pred psp(pstate(T1), pstate('')).
%:- mode psp(pdi, puo) is det.
%
%psp -->
%    plus(ps)			    then (pred(_::in, pdi, puo) is det -->
%    return
%    ).

%:- pred ps(pstate(T1), pstate('')).
%:- mode ps(pdi, puo) is det.
%
%ps -->
%    x(s) or x(pEReference) or x(comment).

% pe(P, PS0, PS):
%   Combine two alternatives with the `or' combinator: pEReference
%   parameterised by pe(P) itself (so the rule is recursive), and the
%   supplied parser P.
:- pred pe(parser(T1, T2), pstate(T1), pstate(T2)).
:- mode pe(in(parser), pdi, puo) is det.

pe(P, PS0, PS) :-
    or(pEReference(pe(P)), P, PS0, PS).

% null(PS, PS):
%   The do-nothing parser: the output parser state is the input state.
:- pred null(pstate(T1), pstate(T1)).
:- mode null(pdi, puo) is det.

null(PS, PS).

% getEntity(EntityDef, PS0, PS):
%   Produce the replacement text for an entity definition as the result
%   of the parser state.
%
%   internal(Entity)
%       The text is already at hand and is returned directly.
%   external(system(SystemId))
%       SystemId is looked up with find/2 in the directory list stored
%       under gDirs, and the file's contents are read as a string.
%       A failed lookup or a failed read is reported through error/1.
%   external(public(PublicId, SystemId))
%       A non-empty SystemId is used directly; otherwise PublicId is
%       looked up in the catalog stored under gCatalog and the system
%       identifier found there is used.  If the catalog has no entry
%       the parse fails with "Entity not found".
:- pred getEntity(entityDef, pstate(T1), pstate(dtd:entity)).
:- mode getEntity(in, pdi, puo) is det.

getEntity(internal(Entity)) -->
    return(Entity).
getEntity(external(system(SystemId))) -->
    get(gDirs, dirs(Dirs)),
    io(find(SystemId, Dirs), Res0),
    (
        { Res0 = ok(_Path) },
        % NOTE(review): find/2 presumably leaves the located file as the
        % current input stream (via `see') -- the read below takes no
        % stream argument.  Confirm against the definition of find.
        io((pred((Res10, Entity0)::out, di, uo) is det -->
            read_file_as_string(Res10, Entity0)
        ), (Res1, Entity)),
        % Restore the previous input stream whether or not the read
        % succeeded; previously this was skipped when the read failed,
        % leaving the input stream redirected to the entity's file.
        io(seen),
        (
            { Res1 = ok },
            return(Entity)
        ;
            { Res1 = error(Err) },
            { io__error_message(Err, Msg) },
            error(Msg)
        )
    ;
        { Res0 = error(Msg) },
        error(Msg)
    ).
getEntity(external(public(PublicId, SystemId))) -->
    ( { SystemId \= "" } ->
        % An explicit system identifier takes precedence over the catalog.
        getEntity(external(system(SystemId)))
    ;
        get(gCatalog, catalog(Catalog)),
        ( { search(Catalog, PublicId, FoundSystemId) } ->
            getEntity(external(system(FoundSystemId)))
        ;
            error("Entity not found")
        )
    ).

% warn(Msg, PS0, PS):
%   Write "warning: ", then Msg, then a newline to the standard error
%   stream, running the output action through the io/1 parser wrapper.
:- pred warn(string, pstate(T1), pstate(T1)).
:- mode warn(in, pdi, puo) is det.

warn(Msg, PS0, PS) :-
    Report = (pred(IO0::di, IO::uo) is det :-
        io__stderr_stream(ErrStream, IO0, IO1),
        write_string(ErrStream, "warning: ", IO1, IO2),
        write_string(ErrStream, Msg, IO2, IO3),
        write_string(ErrStream, "\n", IO3, IO)
    ),
    io(Report, PS0, PS).



More information about the developers mailing list