summaryrefslogtreecommitdiff
path: root/appl/lib/xml.b
diff options
context:
space:
mode:
Diffstat (limited to 'appl/lib/xml.b')
-rw-r--r--appl/lib/xml.b712
1 files changed, 712 insertions, 0 deletions
diff --git a/appl/lib/xml.b b/appl/lib/xml.b
new file mode 100644
index 00000000..5e10608d
--- /dev/null
+++ b/appl/lib/xml.b
@@ -0,0 +1,712 @@
+implement Xml;
+
+#
+# Portions copyright © 2002 Vita Nuova Holdings Limited
+#
+#
+# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer
+#
+
+# TO DO:
+# - provide a way of getting attributes out of <?...?> (process) requests,
+# so that we can process stylesheet requests given in that way.
+
+include "sys.m";
+ sys: Sys;
+include "bufio.m";
+ bufio: Bufio;
+ Iobuf: import bufio;
+include "string.m";
+ str: String;
+include "hash.m";
+ hash: Hash;
+ HashTable: import hash;
+include "xml.m";
+
+Parcel: adt {
+ pick {
+ Start or
+ Empty =>
+ name: string;
+ attrs: Attributes;
+ End =>
+ name: string;
+ Text =>
+ ch: string;
+ ws1, ws2: int;
+ Process =>
+ target: string;
+ data: string;
+ Error =>
+ loc: Locator;
+ msg: string;
+ Doctype =>
+ name: string;
+ public: int;
+ params: list of string;
+ Stylesheet =>
+ attrs: Attributes;
+ EOF =>
+ }
+};
+
+entinit := array[] of {
+ ("AElig", "Æ"),
+ ("OElig", "Œ"),
+ ("aelig", "æ"),
+ ("amp", "&"),
+ ("apos", "\'"),
+ ("copy", "©"),
+ ("gt", ">"),
+ ("ldquo", "``"),
+ ("lt", "<"),
+ ("mdash", "-"), # XXX ??
+ ("oelig", "œ"),
+ ("quot", "\""),
+ ("rdquo", "''"),
+ ("rsquo", "'"),
+ ("trade", "™"),
+ ("nbsp", "\u00a0"),
+};
+entdict: ref HashTable;
+
+init(): string
+{
+ sys = load Sys Sys->PATH;
+ bufio = load Bufio Bufio->PATH;
+ if (bufio == nil)
+ return sys->sprint("cannot load %s: %r", Bufio->PATH);
+ str = load String String->PATH;
+ if (str == nil)
+ return sys->sprint("cannot load %s: %r", String->PATH);
+ hash = load Hash Hash->PATH;
+ if (hash == nil)
+ return sys->sprint("cannot load %s: %r", Hash->PATH);
+ entdict = hash->new(23);
+ for (i := 0; i < len entinit; i += 1) {
+ (key, value) := entinit[i];
+ entdict.insert(key, (0, 0.0, value));
+ }
+ return nil;
+}
+
+blankparser: Parser;
+
+open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
+{
+ x := ref blankparser;
+ x.in = bufio->open(srcfile, Bufio->OREAD);
+ if (x.in == nil)
+ return (nil, sys->sprint("cannot open %s: %r", srcfile));
+ # ignore utf16 intialisation character (yuck)
+ c := x.in.getc();
+ if (c != 16rfffe && c != 16rfeff)
+ x.in.ungetc();
+ x.estack = nil;
+ x.loc = Locator(1, srcfile, "");
+ x.warning = warning;
+ x.preelem = preelem;
+ return (x, "");
+}
+
+Parser.next(x: self ref Parser): ref Item
+{
+ curroffset := x.fileoffset;
+ currloc := x.loc;
+ # read up until end of current item
+ while (x.actdepth > x.readdepth) {
+ pick p := getparcel(x) {
+ Start =>
+ x.actdepth++;
+ End =>
+ x.actdepth--;
+ EOF =>
+ x.actdepth = 0; # premature EOF closes all tags
+ Error =>
+ return ref Item.Error(curroffset, x.loc, x.errormsg);
+ }
+ }
+ if (x.actdepth < x.readdepth) {
+ x.fileoffset = int x.in.offset();
+ return nil;
+ }
+ gp := getparcel(x);
+ item: ref Item;
+ pick p := gp {
+ Start =>
+ x.actdepth++;
+ item = ref Item.Tag(curroffset, p.name, p.attrs);
+ End =>
+ x.actdepth--;
+ item = nil;
+ EOF =>
+ x.actdepth = 0;
+ item = nil;
+ Error =>
+ x.actdepth = 0; # XXX is this the right thing to do?
+ item = ref Item.Error(curroffset, currloc, x.errormsg);
+ Text =>
+ item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2);
+ Process =>
+ item = ref Item.Process(curroffset, p.target, p.data);
+ Empty =>
+ item = ref Item.Tag(curroffset, p.name, p.attrs);
+ Doctype =>
+ item = ref Item.Doctype(curroffset, p.name, p.public, p.params);
+ Stylesheet =>
+ item = ref Item.Stylesheet(curroffset, p.attrs);
+ }
+ x.fileoffset = int x.in.offset();
+ return item;
+}
+
+Parser.atmark(x: self ref Parser, m: ref Mark): int
+{
+ return int x.in.offset() == m.offset;
+}
+
+Parser.down(x: self ref Parser)
+{
+ x.readdepth++;
+}
+
+Parser.up(x: self ref Parser)
+{
+ x.readdepth--;
+}
+
+# mark is only defined after a next(), not after up() or down().
+# this means that we don't have to record lots of state when going up or down levels.
+Parser.mark(x: self ref Parser): ref Mark
+{
+ return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth);
+}
+
+Parser.goto(x: self ref Parser, m: ref Mark)
+{
+ x.in.seek(big m.offset, Sys->SEEKSTART);
+ x.fileoffset = m.offset;
+ x.eof = 0;
+ x.estack = m.estack;
+ x.loc.line = m.line;
+ x.readdepth = m.readdepth;
+ x.actdepth = len x.estack;
+}
+
+Mark.str(m: self ref Mark): string
+{
+ # assume that neither the filename nor any of the tags contain spaces.
+ # format:
+ # offset readdepth linenum [tag...]
+ # XXX would be nice if the produced string did not contain
+ # any spaces so it could be treated as a word in other contexts.
+ s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line);
+ for (t := m.estack; t != nil; t = tl t)
+ s += " " + hd t;
+ return s;
+}
+
+Parser.str2mark(p: self ref Parser, s: string): ref Mark
+{
+ (n, toks) := sys->tokenize(s, " ");
+ if (n < 3)
+ return nil;
+ m := ref Mark(nil, p.loc.line, 0, 0);
+ (m.offset, toks) = (int hd toks, tl toks);
+ (m.readdepth, toks) = (int hd toks, tl toks);
+ (m.line, toks) = (int hd toks, tl toks);
+ m.estack = toks;
+ return m;
+}
+
+getparcel(x: ref Parser): ref Parcel
+{
+ {
+ p: ref Parcel;
+ while (!x.eof && p == nil) {
+ c := getc(x);
+ if (c == '<')
+ p = element(x);
+ else {
+ ungetc(x);
+ p = characters(x);
+ }
+ }
+ if (p == nil)
+ p = ref Parcel.EOF;
+ return p;
+ }
+ exception e{
+ "sax:*" =>
+ return ref Parcel.Error(x.loc, x.errormsg);
+ }
+}
+
+parcelstr(gi: ref Parcel): string
+{
+ if (gi == nil)
+ return "nil";
+ pick i := gi {
+ Start =>
+ return sys->sprint("Start: %s", i.name);
+ Empty =>
+ return sys->sprint("Empty: %s", i.name);
+ End =>
+ return "End";
+ Text =>
+ return "Text";
+ Doctype =>
+ return sys->sprint("Doctype: %s", i.name);
+ Stylesheet =>
+ return "Stylesheet";
+ Error =>
+ return "Error: " + i.msg;
+ EOF =>
+ return "EOF";
+ * =>
+ return "Unknown";
+ }
+}
+
+element(x: ref Parser): ref Parcel
+{
+ # <tag ...>
+ elemname := xmlname(x);
+ c: int;
+ if (elemname != "") {
+ attrs := buildattrs(x);
+ skipwhite(x);
+ c = getc(x);
+ isend := 0;
+ if (c == '/')
+ isend = 1;
+ else
+ ungetc(x);
+ expect(x, '>');
+
+ if (isend)
+ return ref Parcel.Empty(elemname, attrs);
+ else {
+ startelement(x, elemname);
+ return ref Parcel.Start(elemname, attrs);
+ }
+ # </tag>
+ } else if ((c = getc(x)) == '/') {
+ elemname = xmlname(x);
+ if (elemname != "") {
+ expect(x, '>');
+ endelement(x, elemname);
+ return ref Parcel.End(elemname);
+ }
+ else
+ error(x, sys->sprint("illegal beginning of tag: '%c'", c));
+ # <?tag ... ?>
+ } else if (c == '?') {
+ elemname = xmlname(x);
+ if (elemname != "") {
+ # this special case could be generalised if there were many
+ # processing instructions that took attributes like this.
+ if (elemname == "xml-stylesheet") {
+ attrs := buildattrs(x);
+ balancedstring(x, "?>");
+ return ref Parcel.Stylesheet(attrs);
+ } else {
+ data := balancedstring(x, "?>");
+ return ref Parcel.Process(elemname, data);
+ }
+ }
+ } else if (c == '!') {
+ c = getc(x);
+ case c {
+ '-' =>
+ # <!-- comment -->
+ if(getc(x) == '-'){
+ balancedstring(x, "-->");
+ return nil;
+ }
+ '[' =>
+ # <![CDATA[...]]
+ s := xmlname(x);
+ if(s == "CDATA" && getc(x) == '['){
+ data := balancedstring(x, "]]>");
+ return ref Parcel.Text(data, 0, 0);
+ }
+ * =>
+ # <!declaration
+ ungetc(x);
+ s := xmlname(x);
+ case s {
+ "DOCTYPE" =>
+ # <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?>
+ skipwhite(x);
+ name := xmlname(x);
+ if(name == nil)
+ break;
+ id := "";
+ uri := "";
+ public := 0;
+ skipwhite(x);
+ case sort := xmlname(x) {
+ "SYSTEM" =>
+ id = xmlstring(x, 1);
+ "PUBLIC" =>
+ public = 1;
+ id = xmlstring(x, 1);
+ skipwhite(x);
+ c = getc(x);
+ ungetc(x);
+ if(c == '"' || c == '\'')
+ uri = xmlstring(x, 1);
+ * =>
+ error(x, sys->sprint("unknown DOCTYPE: %s", sort));
+ return nil;
+ }
+ skipwhite(x);
+ if(getc(x) == '['){
+ error(x, "cannot handle DOCTYPE with declarations");
+ return nil;
+ }
+ ungetc(x);
+ skipwhite(x);
+ if(getc(x) == '>')
+ return ref Parcel.Doctype(name, public, id :: uri :: nil);
+ "ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" =>
+ # don't interpret internal DTDs
+ # <!ENTITY name ("value" | SYSTEM "filename")>
+ s = gets(x, '>');
+ if(s == nil || s[len s-1] != '>')
+ error(x, "end of file in declaration");
+ return nil;
+ * =>
+ error(x, sys->sprint("unknown declaration: %s", s));
+ }
+ }
+ error(x, "invalid XML declaration");
+ } else
+ error(x, sys->sprint("illegal beginning of tag: %c", c));
+ return nil;
+}
+
+characters(x: ref Parser): ref Parcel
+{
+ p: ref Parcel;
+ content := gets(x, '<');
+ if (len content > 0) {
+ if (content[len content - 1] == '<') {
+ ungetc(x);
+ content = content[0:len content - 1];
+ }
+ ws1, ws2: int;
+ if (x.ispre) {
+ content = substituteentities(x, content);
+ ws1 = ws2 = 0;
+ } else
+ (content, ws1, ws2) = substituteentities_sp(x, content);
+ if (content != nil || ws1)
+ p = ref Parcel.Text(content, ws1, ws2);
+ }
+ return p;
+}
+
+startelement(x: ref Parser, name: string)
+{
+ x.estack = name :: x.estack;
+ if (name == x.preelem)
+ x.ispre++;
+}
+
+endelement(x: ref Parser, name: string)
+{
+ if (x.estack != nil && name == hd x.estack) {
+ x.estack = tl x.estack;
+ if (name == x.preelem)
+ x.ispre--;
+ } else {
+ starttag := "";
+ if (x.estack != nil)
+ starttag = hd x.estack;
+ warning(x, sys->sprint("<%s></%s> mismatch", starttag, name));
+
+ # invalid XML but try to recover anyway to reduce turnaround time on fixing errors.
+ # loop back up through the tag stack to see if there's a matching tag, in which case
+ # jump up in the stack to that, making some rude assumptions about the
+ # way Parcels are handled at the top level.
+ n := 0;
+ for (t := x.estack; t != nil; (t, n) = (tl t, n + 1))
+ if (hd t == name)
+ break;
+ if (t != nil) {
+ x.estack = tl t;
+ x.actdepth -= n;
+ }
+ }
+}
+
+buildattrs(x: ref Parser): Attributes
+{
+ attrs: list of Attribute;
+
+ attr: Attribute;
+ for (;;) {
+ skipwhite(x);
+ attr.name = xmlname(x);
+ if (attr.name == nil)
+ break;
+ skipwhite(x);
+ c := getc(x);
+ if(c != '='){
+ ungetc(x);
+ attr.value = nil;
+ }else
+ attr.value = xmlstring(x, 1);
+ attrs = attr :: attrs;
+ }
+ return Attributes(attrs);
+}
+
+xmlstring(x: ref Parser, dosub: int): string
+{
+ skipwhite(x);
+ s := "";
+ delim := getc(x);
+ if (delim == '\"' || delim == '\'') {
+ s = gets(x, delim);
+ n := len s;
+ if (n == 0 || s[n-1] != delim)
+ error(x, "unclosed string at end of file");
+ s = s[0:n-1]; # TO DO: avoid copy
+ if(dosub)
+ s = substituteentities(x, s);
+ } else
+ error(x, sys->sprint("illegal string delimiter: %c", delim));
+ return s;
+}
+
+xmlname(x: ref Parser): string
+{
+ name := "";
+ ch := getc(x);
+ case ch {
+ '_' or ':' or
+ 'a' to 'z' or
+ 'A' to 'Z' or
+ 16r100 to 16rd7ff or
+ 16re000 or 16rfffd =>
+ name[0] = ch;
+loop:
+ for (;;) {
+ case ch = getc(x) {
+ '_' or '-' or ':' or '.' or
+ 'a' to 'z' or
+ '0' to '9' or
+ 'A' to 'Z' or
+ 16r100 to 16rd7ff or
+ 16re000 to 16rfffd =>
+ name[len name] = ch;
+ * =>
+ break loop;
+ }
+ }
+ }
+ ungetc(x);
+ return name;
+}
+
+substituteentities(x: ref Parser, buff: string): string
+{
+ i := 0;
+ while (i < len buff) {
+ if (buff[i] == '&') {
+ (t, j) := translateentity(x, buff, i);
+ # XXX could be quicker
+ buff = buff[0:i] + t + buff[j:];
+ i += len t;
+ } else
+ i++;
+ }
+ return buff;
+}
+
+# subsitute entities, squashing whitespace along the way.
+substituteentities_sp(x: ref Parser, buf: string): (string, int, int)
+{
+ firstwhite := 0;
+ # skip initial white space
+ for (i := 0; i < len buf; i++) {
+ c := buf[i];
+ if (c != ' ' && c != '\t' && c != '\n' && c != '\r')
+ break;
+ firstwhite = 1;
+ }
+
+ lastwhite := 0;
+ s := "";
+ for (; i < len buf; i++) {
+ c := buf[i];
+ if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+ lastwhite = 1;
+ else {
+ if (lastwhite) {
+ s[len s] = ' ';
+ lastwhite = 0;
+ }
+ if (c == '&') {
+ # should &x20; count as whitespace?
+ (ent, j) := translateentity(x, buf, i);
+ i = j - 1;
+ s += ent;
+ } else
+ s[len s] = c;
+ }
+ }
+ return (s, firstwhite, lastwhite);
+}
+
+translateentity(x: ref Parser, s: string, i: int): (string, int)
+{
+ i++;
+ for (j := i; j < len s; j++)
+ if (s[j] == ';')
+ break;
+ ent := s[i:j];
+ if (j == len s) {
+ if (len ent > 10)
+ ent = ent[0:11] + "...";
+ warning(x, sys->sprint("missing ; at end of entity (&%s)", ent));
+ return (nil, i);
+ }
+ j++;
+ if (ent == nil) {
+ warning(x, "empty entity");
+ return ("", j);
+ }
+ if (ent[0] == '#') {
+ n: int;
+ rem := ent;
+ if (len ent >= 3 && ent[1] == 'x')
+ (n, rem) = str->toint(ent[2:], 16);
+ else if (len ent >= 2)
+ (n, rem) = str->toint(ent[1:], 10);
+ if (rem != nil) {
+ warning(x, sys->sprint("unrecognized entity (&%s)", ent));
+ return (nil, j);
+ }
+ ch: string = nil;
+ ch[0] = n;
+ return (ch, j);
+ }
+ hv := entdict.find(ent);
+ if (hv == nil) {
+ warning(x, sys->sprint("unrecognized entity (&%s)", ent));
+ return (nil, j);
+ }
+ return (hv.s, j);
+}
+
+balancedstring(x: ref Parser, eos: string): string
+{
+ s := "";
+ instring := 0;
+ quote: int;
+
+ for (i := 0; i < len eos; i++)
+ s[len s] = ' ';
+
+ skipwhite(x);
+ while ((c := getc(x)) != Bufio->EOF) {
+ s[len s] = c;
+ if (instring) {
+ if (c == quote)
+ instring = 0;
+ } else if (c == '\"' || c == '\'') {
+ quote = c;
+ instring = 1;
+ } else if (s[len s - len eos : len s] == eos)
+ return s[len eos : len s - len eos];
+ }
+ error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos));
+ return "";
+}
+
+skipwhite(x: ref Parser)
+{
+ while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r')
+ ;
+ ungetc(x);
+}
+
+expectwhite(x: ref Parser)
+{
+ if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r')
+ error(x, "expecting white space");
+ skipwhite(x);
+}
+
+expect(x: ref Parser, ch: int)
+{
+ skipwhite(x);
+ c := getc(x);
+ if (c != ch)
+ error(x, sys->sprint("expecting %c", ch));
+}
+
+getc(x: ref Parser): int
+{
+ if (x.eof)
+ return Bufio->EOF;
+ ch := x.in.getc();
+ if (ch == Bufio->EOF)
+ x.eof = 1;
+ else if (ch == '\n')
+ x.loc.line++;
+ x.lastnl = ch == '\n';
+ return ch;
+}
+
+gets(x: ref Parser, delim: int): string
+{
+ if (x.eof)
+ return "";
+ s := x.in.gets(delim);
+ for (i := 0; i < len s; i++)
+ if (s[i] == '\n')
+ x.loc.line++;
+ if (s == "")
+ x.eof = 1;
+ else
+ x.lastnl = s[len s - 1] == '\n';
+ return s;
+}
+
+ungetc(x: ref Parser)
+{
+ if (x.eof)
+ return;
+ x.in.ungetc();
+ x.loc.line -= x.lastnl;
+}
+
+Attributes.all(al: self Attributes): list of Attribute
+{
+ return al.attrs;
+}
+
+Attributes.get(attrs: self Attributes, name: string): string
+{
+ for (a := attrs.attrs; a != nil; a = tl a)
+ if ((hd a).name == name)
+ return (hd a).value;
+ return nil;
+}
+
+warning(x: ref Parser, msg: string)
+{
+ if (x.warning != nil)
+ x.warning <-= (x.loc, msg);
+}
+
+error(x: ref Parser, msg: string)
+{
+ x.errormsg = msg;
+ raise "sax:error";
+}