diff options
| author | Charles.Forsyth <devnull@localhost> | 2006-12-22 17:07:39 +0000 |
|---|---|---|
| committer | Charles.Forsyth <devnull@localhost> | 2006-12-22 17:07:39 +0000 |
| commit | 37da2899f40661e3e9631e497da8dc59b971cbd0 (patch) | |
| tree | cbc6d4680e347d906f5fa7fca73214418741df72 /appl/lib/xml.b | |
| parent | 54bc8ff236ac10b3eaa928fd6bcfc0cdb2ba46ae (diff) | |
20060303a
Diffstat (limited to 'appl/lib/xml.b')
| -rw-r--r-- | appl/lib/xml.b | 712 |
1 files changed, 712 insertions, 0 deletions
diff --git a/appl/lib/xml.b b/appl/lib/xml.b new file mode 100644 index 00000000..5e10608d --- /dev/null +++ b/appl/lib/xml.b @@ -0,0 +1,712 @@ +implement Xml; + +# +# Portions copyright © 2002 Vita Nuova Holdings Limited +# +# +# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer +# + +# TO DO: +# - provide a way of getting attributes out of <?...?> (process) requests, +# so that we can process stylesheet requests given in that way. + +include "sys.m"; + sys: Sys; +include "bufio.m"; + bufio: Bufio; + Iobuf: import bufio; +include "string.m"; + str: String; +include "hash.m"; + hash: Hash; + HashTable: import hash; +include "xml.m"; + +Parcel: adt { + pick { + Start or + Empty => + name: string; + attrs: Attributes; + End => + name: string; + Text => + ch: string; + ws1, ws2: int; + Process => + target: string; + data: string; + Error => + loc: Locator; + msg: string; + Doctype => + name: string; + public: int; + params: list of string; + Stylesheet => + attrs: Attributes; + EOF => + } +}; + +entinit := array[] of { + ("AElig", "Æ"), + ("OElig", "Œ"), + ("aelig", "æ"), + ("amp", "&"), + ("apos", "\'"), + ("copy", "©"), + ("gt", ">"), + ("ldquo", "``"), + ("lt", "<"), + ("mdash", "-"), # XXX ?? + ("oelig", "œ"), + ("quot", "\""), + ("rdquo", "''"), + ("rsquo", "'"), + ("trade", "™"), + ("nbsp", "\u00a0"), +}; +entdict: ref HashTable; + +init(): string +{ + sys = load Sys Sys->PATH; + bufio = load Bufio Bufio->PATH; + if (bufio == nil) + return sys->sprint("cannot load %s: %r", Bufio->PATH); + str = load String String->PATH; + if (str == nil) + return sys->sprint("cannot load %s: %r", String->PATH); + hash = load Hash Hash->PATH; + if (hash == nil) + return sys->sprint("cannot load %s: %r", Hash->PATH); + entdict = hash->new(23); + for (i := 0; i < len entinit; i += 1) { + (key, value) := entinit[i]; + entdict.insert(key, (0, 0.0, value)); + } + return nil; +} + +blankparser: Parser; + +open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string) +{ + x := ref blankparser; + x.in = bufio->open(srcfile, Bufio->OREAD); + if (x.in == nil) + return (nil, sys->sprint("cannot open %s: %r", srcfile)); + # ignore utf16 intialisation character (yuck) + c := x.in.getc(); + if (c != 16rfffe && c != 16rfeff) + x.in.ungetc(); + x.estack = nil; + x.loc = Locator(1, srcfile, ""); + x.warning = warning; + x.preelem = preelem; + return (x, ""); +} + +Parser.next(x: self ref Parser): ref Item +{ + curroffset := x.fileoffset; + currloc := x.loc; + # read up until end of current item + while (x.actdepth > x.readdepth) { + pick p := getparcel(x) { + Start => + x.actdepth++; + End => + x.actdepth--; + EOF => + x.actdepth = 0; # premature EOF closes all tags + Error => + return ref Item.Error(curroffset, x.loc, x.errormsg); + } + } + if (x.actdepth < x.readdepth) { + x.fileoffset = int x.in.offset(); + return nil; + } + gp := getparcel(x); + item: ref Item; + pick p := gp { + Start => + x.actdepth++; + item = ref Item.Tag(curroffset, p.name, p.attrs); + End => + x.actdepth--; + item = nil; + EOF => + x.actdepth = 0; + item = nil; + Error => + x.actdepth = 0; # XXX is this the right thing to do? + item = ref Item.Error(curroffset, currloc, x.errormsg); + Text => + item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2); + Process => + item = ref Item.Process(curroffset, p.target, p.data); + Empty => + item = ref Item.Tag(curroffset, p.name, p.attrs); + Doctype => + item = ref Item.Doctype(curroffset, p.name, p.public, p.params); + Stylesheet => + item = ref Item.Stylesheet(curroffset, p.attrs); + } + x.fileoffset = int x.in.offset(); + return item; +} + +Parser.atmark(x: self ref Parser, m: ref Mark): int +{ + return int x.in.offset() == m.offset; +} + +Parser.down(x: self ref Parser) +{ + x.readdepth++; +} + +Parser.up(x: self ref Parser) +{ + x.readdepth--; +} + +# mark is only defined after a next(), not after up() or down(). +# this means that we don't have to record lots of state when going up or down levels. +Parser.mark(x: self ref Parser): ref Mark +{ + return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth); +} + +Parser.goto(x: self ref Parser, m: ref Mark) +{ + x.in.seek(big m.offset, Sys->SEEKSTART); + x.fileoffset = m.offset; + x.eof = 0; + x.estack = m.estack; + x.loc.line = m.line; + x.readdepth = m.readdepth; + x.actdepth = len x.estack; +} + +Mark.str(m: self ref Mark): string +{ + # assume that neither the filename nor any of the tags contain spaces. + # format: + # offset readdepth linenum [tag...] + # XXX would be nice if the produced string did not contain + # any spaces so it could be treated as a word in other contexts. + s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line); + for (t := m.estack; t != nil; t = tl t) + s += " " + hd t; + return s; +} + +Parser.str2mark(p: self ref Parser, s: string): ref Mark +{ + (n, toks) := sys->tokenize(s, " "); + if (n < 3) + return nil; + m := ref Mark(nil, p.loc.line, 0, 0); + (m.offset, toks) = (int hd toks, tl toks); + (m.readdepth, toks) = (int hd toks, tl toks); + (m.line, toks) = (int hd toks, tl toks); + m.estack = toks; + return m; +} + +getparcel(x: ref Parser): ref Parcel +{ + { + p: ref Parcel; + while (!x.eof && p == nil) { + c := getc(x); + if (c == '<') + p = element(x); + else { + ungetc(x); + p = characters(x); + } + } + if (p == nil) + p = ref Parcel.EOF; + return p; + } + exception e{ + "sax:*" => + return ref Parcel.Error(x.loc, x.errormsg); + } +} + +parcelstr(gi: ref Parcel): string +{ + if (gi == nil) + return "nil"; + pick i := gi { + Start => + return sys->sprint("Start: %s", i.name); + Empty => + return sys->sprint("Empty: %s", i.name); + End => + return "End"; + Text => + return "Text"; + Doctype => + return sys->sprint("Doctype: %s", i.name); + Stylesheet => + return "Stylesheet"; + Error => + return "Error: " + i.msg; + EOF => + return "EOF"; + * => + return "Unknown"; + } +} + +element(x: ref Parser): ref Parcel +{ + # <tag ...> + elemname := xmlname(x); + c: int; + if (elemname != "") { + attrs := buildattrs(x); + skipwhite(x); + c = getc(x); + isend := 0; + if (c == '/') + isend = 1; + else + ungetc(x); + expect(x, '>'); + + if (isend) + return ref Parcel.Empty(elemname, attrs); + else { + startelement(x, elemname); + return ref Parcel.Start(elemname, attrs); + } + # </tag> + } else if ((c = getc(x)) == '/') { + elemname = xmlname(x); + if (elemname != "") { + expect(x, '>'); + endelement(x, elemname); + return ref Parcel.End(elemname); + } + else + error(x, sys->sprint("illegal beginning of tag: '%c'", c)); + # <?tag ... ?> + } else if (c == '?') { + elemname = xmlname(x); + if (elemname != "") { + # this special case could be generalised if there were many + # processing instructions that took attributes like this. + if (elemname == "xml-stylesheet") { + attrs := buildattrs(x); + balancedstring(x, "?>"); + return ref Parcel.Stylesheet(attrs); + } else { + data := balancedstring(x, "?>"); + return ref Parcel.Process(elemname, data); + } + } + } else if (c == '!') { + c = getc(x); + case c { + '-' => + # <!-- comment --> + if(getc(x) == '-'){ + balancedstring(x, "-->"); + return nil; + } + '[' => + # <![CDATA[...]] + s := xmlname(x); + if(s == "CDATA" && getc(x) == '['){ + data := balancedstring(x, "]]>"); + return ref Parcel.Text(data, 0, 0); + } + * => + # <!declaration + ungetc(x); + s := xmlname(x); + case s { + "DOCTYPE" => + # <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?> + skipwhite(x); + name := xmlname(x); + if(name == nil) + break; + id := ""; + uri := ""; + public := 0; + skipwhite(x); + case sort := xmlname(x) { + "SYSTEM" => + id = xmlstring(x, 1); + "PUBLIC" => + public = 1; + id = xmlstring(x, 1); + skipwhite(x); + c = getc(x); + ungetc(x); + if(c == '"' || c == '\'') + uri = xmlstring(x, 1); + * => + error(x, sys->sprint("unknown DOCTYPE: %s", sort)); + return nil; + } + skipwhite(x); + if(getc(x) == '['){ + error(x, "cannot handle DOCTYPE with declarations"); + return nil; + } + ungetc(x); + skipwhite(x); + if(getc(x) == '>') + return ref Parcel.Doctype(name, public, id :: uri :: nil); + "ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" => + # don't interpret internal DTDs + # <!ENTITY name ("value" | SYSTEM "filename")> + s = gets(x, '>'); + if(s == nil || s[len s-1] != '>') + error(x, "end of file in declaration"); + return nil; + * => + error(x, sys->sprint("unknown declaration: %s", s)); + } + } + error(x, "invalid XML declaration"); + } else + error(x, sys->sprint("illegal beginning of tag: %c", c)); + return nil; +} + +characters(x: ref Parser): ref Parcel +{ + p: ref Parcel; + content := gets(x, '<'); + if (len content > 0) { + if (content[len content - 1] == '<') { + ungetc(x); + content = content[0:len content - 1]; + } + ws1, ws2: int; + if (x.ispre) { + content = substituteentities(x, content); + ws1 = ws2 = 0; + } else + (content, ws1, ws2) = substituteentities_sp(x, content); + if (content != nil || ws1) + p = ref Parcel.Text(content, ws1, ws2); + } + return p; +} + +startelement(x: ref Parser, name: string) +{ + x.estack = name :: x.estack; + if (name == x.preelem) + x.ispre++; +} + +endelement(x: ref Parser, name: string) +{ + if (x.estack != nil && name == hd x.estack) { + x.estack = tl x.estack; + if (name == x.preelem) + x.ispre--; + } else { + starttag := ""; + if (x.estack != nil) + starttag = hd x.estack; + warning(x, sys->sprint("<%s></%s> mismatch", starttag, name)); + + # invalid XML but try to recover anyway to reduce turnaround time on fixing errors. + # loop back up through the tag stack to see if there's a matching tag, in which case + # jump up in the stack to that, making some rude assumptions about the + # way Parcels are handled at the top level. + n := 0; + for (t := x.estack; t != nil; (t, n) = (tl t, n + 1)) + if (hd t == name) + break; + if (t != nil) { + x.estack = tl t; + x.actdepth -= n; + } + } +} + +buildattrs(x: ref Parser): Attributes +{ + attrs: list of Attribute; + + attr: Attribute; + for (;;) { + skipwhite(x); + attr.name = xmlname(x); + if (attr.name == nil) + break; + skipwhite(x); + c := getc(x); + if(c != '='){ + ungetc(x); + attr.value = nil; + }else + attr.value = xmlstring(x, 1); + attrs = attr :: attrs; + } + return Attributes(attrs); +} + +xmlstring(x: ref Parser, dosub: int): string +{ + skipwhite(x); + s := ""; + delim := getc(x); + if (delim == '\"' || delim == '\'') { + s = gets(x, delim); + n := len s; + if (n == 0 || s[n-1] != delim) + error(x, "unclosed string at end of file"); + s = s[0:n-1]; # TO DO: avoid copy + if(dosub) + s = substituteentities(x, s); + } else + error(x, sys->sprint("illegal string delimiter: %c", delim)); + return s; +} + +xmlname(x: ref Parser): string +{ + name := ""; + ch := getc(x); + case ch { + '_' or ':' or + 'a' to 'z' or + 'A' to 'Z' or + 16r100 to 16rd7ff or + 16re000 or 16rfffd => + name[0] = ch; +loop: + for (;;) { + case ch = getc(x) { + '_' or '-' or ':' or '.' or + 'a' to 'z' or + '0' to '9' or + 'A' to 'Z' or + 16r100 to 16rd7ff or + 16re000 to 16rfffd => + name[len name] = ch; + * => + break loop; + } + } + } + ungetc(x); + return name; +} + +substituteentities(x: ref Parser, buff: string): string +{ + i := 0; + while (i < len buff) { + if (buff[i] == '&') { + (t, j) := translateentity(x, buff, i); + # XXX could be quicker + buff = buff[0:i] + t + buff[j:]; + i += len t; + } else + i++; + } + return buff; +} + +# subsitute entities, squashing whitespace along the way. +substituteentities_sp(x: ref Parser, buf: string): (string, int, int) +{ + firstwhite := 0; + # skip initial white space + for (i := 0; i < len buf; i++) { + c := buf[i]; + if (c != ' ' && c != '\t' && c != '\n' && c != '\r') + break; + firstwhite = 1; + } + + lastwhite := 0; + s := ""; + for (; i < len buf; i++) { + c := buf[i]; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + lastwhite = 1; + else { + if (lastwhite) { + s[len s] = ' '; + lastwhite = 0; + } + if (c == '&') { + # should &x20; count as whitespace? + (ent, j) := translateentity(x, buf, i); + i = j - 1; + s += ent; + } else + s[len s] = c; + } + } + return (s, firstwhite, lastwhite); +} + +translateentity(x: ref Parser, s: string, i: int): (string, int) +{ + i++; + for (j := i; j < len s; j++) + if (s[j] == ';') + break; + ent := s[i:j]; + if (j == len s) { + if (len ent > 10) + ent = ent[0:11] + "..."; + warning(x, sys->sprint("missing ; at end of entity (&%s)", ent)); + return (nil, i); + } + j++; + if (ent == nil) { + warning(x, "empty entity"); + return ("", j); + } + if (ent[0] == '#') { + n: int; + rem := ent; + if (len ent >= 3 && ent[1] == 'x') + (n, rem) = str->toint(ent[2:], 16); + else if (len ent >= 2) + (n, rem) = str->toint(ent[1:], 10); + if (rem != nil) { + warning(x, sys->sprint("unrecognized entity (&%s)", ent)); + return (nil, j); + } + ch: string = nil; + ch[0] = n; + return (ch, j); + } + hv := entdict.find(ent); + if (hv == nil) { + warning(x, sys->sprint("unrecognized entity (&%s)", ent)); + return (nil, j); + } + return (hv.s, j); +} + +balancedstring(x: ref Parser, eos: string): string +{ + s := ""; + instring := 0; + quote: int; + + for (i := 0; i < len eos; i++) + s[len s] = ' '; + + skipwhite(x); + while ((c := getc(x)) != Bufio->EOF) { + s[len s] = c; + if (instring) { + if (c == quote) + instring = 0; + } else if (c == '\"' || c == '\'') { + quote = c; + instring = 1; + } else if (s[len s - len eos : len s] == eos) + return s[len eos : len s - len eos]; + } + error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos)); + return ""; +} + +skipwhite(x: ref Parser) +{ + while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r') + ; + ungetc(x); +} + +expectwhite(x: ref Parser) +{ + if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r') + error(x, "expecting white space"); + skipwhite(x); +} + +expect(x: ref Parser, ch: int) +{ + skipwhite(x); + c := getc(x); + if (c != ch) + error(x, sys->sprint("expecting %c", ch)); +} + +getc(x: ref Parser): int +{ + if (x.eof) + return Bufio->EOF; + ch := x.in.getc(); + if (ch == Bufio->EOF) + x.eof = 1; + else if (ch == '\n') + x.loc.line++; + x.lastnl = ch == '\n'; + return ch; +} + +gets(x: ref Parser, delim: int): string +{ + if (x.eof) + return ""; + s := x.in.gets(delim); + for (i := 0; i < len s; i++) + if (s[i] == '\n') + x.loc.line++; + if (s == "") + x.eof = 1; + else + x.lastnl = s[len s - 1] == '\n'; + return s; +} + +ungetc(x: ref Parser) +{ + if (x.eof) + return; + x.in.ungetc(); + x.loc.line -= x.lastnl; +} + +Attributes.all(al: self Attributes): list of Attribute +{ + return al.attrs; +} + +Attributes.get(attrs: self Attributes, name: string): string +{ + for (a := attrs.attrs; a != nil; a = tl a) + if ((hd a).name == name) + return (hd a).value; + return nil; +} + +warning(x: ref Parser, msg: string) +{ + if (x.warning != nil) + x.warning <-= (x.loc, msg); +} + +error(x: ref Parser, msg: string) +{ + x.errormsg = msg; + raise "sax:error"; +} |
