diff options
Diffstat (limited to 'appl/charon/lex.b')
| -rw-r--r-- | appl/charon/lex.b | 1340 |
1 files changed, 1340 insertions, 0 deletions
diff --git a/appl/charon/lex.b b/appl/charon/lex.b new file mode 100644 index 00000000..4b7274c0 --- /dev/null +++ b/appl/charon/lex.b @@ -0,0 +1,1340 @@ +implement Lex; + +include "common.m"; + +# local copies from CU +sys: Sys; +CU: CharonUtils; +S: String; +T: StringIntTab; +C: Ctype; +J: Script; +ctype: array of byte; + +EOF : con -2; +EOB : con -1; + +tagnames = array[] of { + " ", + "!", + "a", + "abbr", + "acronym", + "address", + "applet", + "area", + "b", + "base", + "basefont", + "bdo", + "big", + "blink", + "blockquote", + "body", + "bq", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "frame", + "frameset", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "hr", + "html", + "i", + "iframe", + "image", + "img", + "input", + "ins", + "isindex", + "kbd", + "label", + "legend", + "li", + "link", + "map", + "menu", + "meta", + "nobr", + "noframes", + "noscript", + "object", + "ol", + "optgroup", + "option", + "p", + "param", + "pre", + "q", + "s", + "samp", + "script", + "select", + "small", + "span", + "strike", + "strong", + "style", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "var", + "xmp" +}; + +tagtable : array of T->StringInt; # initialized from tagnames + +attrnames = array[] of { + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "align", + "alink", + "alt", + "archive", + "axis", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "classid", + "clear", + "code", + "codebase", + "codetype", + "color", + "cols", + "colspan", + "compact", + "content", + "coords", + "data", + "datafld", + "dataformatas", + "datapagesize", + "datasrc", + "datetime", + "declare", + "defer", + "dir", + "disabled", + 
"enctype", + "event", + "face", + "for", + "frame", + "frameborder", + "headers", + "height", + "href", + "hreflang", + "hspace", + "http-equiv", + "id", + "ismap", + "label", + "lang", + "language", + "link", + "longdesc", + "lowsrc", + "marginheight", + "marginwidth", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noresize", + "noshade", + "nowrap", + "object", + "onabort", + "onblur", + "onchange", + "onclick", + "ondblclick", + "onerror", + "onfocus", + "onkeydown", + "onkeypress", + "onkeyup", + "onload", + "onmousedown", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onreset", + "onresize", + "onselect", + "onsubmit", + "onunload", + "profile", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scheme", + "scope", + "scrolling", + "selected", + "shape", + "size", + "span", + "src", + "standby", + "start", + "style", + "summary", + "tabindex", + "target", + "text", + "title", + "type", + "usemap", + "valign", + "value", + "valuetype", + "version", + "vlink", + "vspace", + "width" +}; + +attrtable : array of T->StringInt; # initialized from attrnames + +chartab:= array[] of { T->StringInt + ("AElig", 'Æ'), + ("Aacute", 'Á'), + ("Acirc", 'Â'), + ("Agrave", 'À'), + ("Alpha", 'Α'), + ("Aring", 'Å'), + ("Atilde", 'Ã'), + ("Auml", 'Ä'), + ("Beta", 'Β'), + ("Ccedil", 'Ç'), + ("Chi", 'Χ'), + ("Dagger", '‡'), + ("Delta", 'Δ'), + ("ETH", 'Ð'), + ("Eacute", 'É'), + ("Ecirc", 'Ê'), + ("Egrave", 'È'), + ("Epsilon", 'Ε'), + ("Eta", 'Η'), + ("Euml", 'Ë'), + ("Gamma", 'Γ'), + ("Iacute", 'Í'), + ("Icirc", 'Î'), + ("Igrave", 'Ì'), + ("Iota", 'Ι'), + ("Iuml", 'Ï'), + ("Kappa", 'Κ'), + ("Lambda", 'Λ'), + ("Mu", 'Μ'), + ("Ntilde", 'Ñ'), + ("Nu", 'Ν'), + ("OElig", 'Œ'), + ("Oacute", 'Ó'), + ("Ocirc", 'Ô'), + ("Ograve", 'Ò'), + ("Omega", 'Ω'), + ("Omicron", 'Ο'), + ("Oslash", 'Ø'), + ("Otilde", 'Õ'), + ("Ouml", 'Ö'), + ("Phi", 'Φ'), + ("Pi", 'Π'), + ("Prime", '″'), + ("Psi", 'Ψ'), + ("Rho", 'Ρ'), + 
("Scaron", 'Š'), + ("Sigma", 'Σ'), + ("THORN", 'Þ'), + ("Tau", 'Τ'), + ("Theta", 'Θ'), + ("Uacute", 'Ú'), + ("Ucirc", 'Û'), + ("Ugrave", 'Ù'), + ("Upsilon", 'Υ'), + ("Uuml", 'Ü'), + ("Xi", 'Ξ'), + ("Yacute", 'Ý'), + ("Yuml", 'Ÿ'), + ("Zeta", 'Ζ'), + ("aacute", 'á'), + ("acirc", 'â'), + ("acute", '´'), + ("aelig", 'æ'), + ("agrave", 'à'), + ("alefsym", 'ℵ'), + ("alpha", 'α'), + ("amp", '&'), + ("and", '∧'), + ("ang", '∠'), + ("aring", 'å'), + ("asymp", '≈'), + ("atilde", 'ã'), + ("auml", 'ä'), + ("bdquo", '„'), + ("beta", 'β'), + ("brvbar", '¦'), + ("bull", '•'), + ("cap", '∩'), + ("ccedil", 'ç'), + ("cdots", '⋯'), + ("cedil", '¸'), + ("cent", '¢'), + ("chi", 'χ'), + ("circ", 'ˆ'), + ("clubs", '♣'), + ("cong", '≅'), + ("copy", '©'), + ("crarr", '↵'), + ("cup", '∪'), + ("curren", '¤'), + ("dArr", '⇓'), + ("dagger", '†'), + ("darr", '↓'), + ("ddots", '⋱'), + ("deg", '°'), + ("delta", 'δ'), + ("diams", '♦'), + ("divide", '÷'), + ("eacute", 'é'), + ("ecirc", 'ê'), + ("egrave", 'è'), + ("emdash", '—'), + ("empty", '∅'), + ("emsp", ' '), + ("endash", '–'), + ("ensp", ' '), + ("epsilon", 'ε'), + ("equiv", '≡'), + ("eta", 'η'), + ("eth", 'ð'), + ("euml", 'ë'), + ("euro", '€'), + ("exist", '∃'), + ("fnof", 'ƒ'), + ("forall", '∀'), + ("frac12", '½'), + ("frac14", '¼'), + ("frac34", '¾'), + ("frasl", '⁄'), + ("gamma", 'γ'), + ("ge", '≥'), + ("gt", '>'), + ("hArr", '⇔'), + ("harr", '↔'), + ("hearts", '♥'), + ("hellip", '…'), + ("iacute", 'í'), + ("icirc", 'î'), + ("iexcl", '¡'), + ("igrave", 'ì'), + ("image", 'ℑ'), + ("infin", '∞'), + ("int", '∫'), + ("iota", 'ι'), + ("iquest", '¿'), + ("isin", '∈'), + ("iuml", 'ï'), + ("kappa", 'κ'), + ("lArr", '⇐'), + ("lambda", 'λ'), + ("lang", '〈'), + ("laquo", '«'), + ("larr", '←'), + ("lceil", '⌈'), + ("ldots", '…'), + ("ldquo", '“'), + ("le", '≤'), + ("lfloor", '⌊'), + ("lowast", '∗'), + ("loz", '◊'), + ("lrm", ''), + ("lsaquo", '‹'), + ("lsquo", '‘'), + ("lt", '<'), + ("macr", '¯'), + ("mdash", '—'), + ("micro", 'µ'), + ("middot", '·'), 
+ ("minus", '−'), + ("mu", 'μ'), + ("nabla", '∇'), + ("nbsp", ' '), + ("ndash", '–'), + ("ne", '≠'), + ("ni", '∋'), + ("not", '¬'), + ("notin", '∉'), + ("nsub", '⊄'), + ("ntilde", 'ñ'), + ("nu", 'ν'), + ("oacute", 'ó'), + ("ocirc", 'ô'), + ("oelig", 'œ'), + ("ograve", 'ò'), + ("oline", '‾'), + ("omega", 'ω'), + ("omicron", 'ο'), + ("oplus", '⊕'), + ("or", '∨'), + ("ordf", 'ª'), + ("ordm", 'º'), + ("oslash", 'ø'), + ("otilde", 'õ'), + ("otimes", '⊗'), + ("ouml", 'ö'), + ("para", '¶'), + ("part", '∂'), + ("permil", '‰'), + ("perp", '⊥'), + ("phi", 'φ'), + ("pi", 'π'), + ("piv", 'ϖ'), + ("plusmn", '±'), + ("pound", '£'), + ("prime", '′'), + ("prod", '∏'), + ("prop", '∝'), + ("psi", 'ψ'), + ("quad", ' '), + ("quot", '"'), + ("quot", '"'), + ("rArr", '⇒'), + ("radic", '√'), + ("rang", '〉'), + ("raquo", '»'), + ("rarr", '→'), + ("rceil", '⌉'), + ("rdquo", '”'), + ("real", 'ℜ'), + ("reg", '®'), + ("rfloor", '⌋'), + ("rho", 'ρ'), + ("rlm", ''), + ("rsaquo", '›'), + ("rsquo", '’'), + ("sbquo", '‚'), + ("scaron", 'š'), + ("sdot", '⋅'), + ("sect", '§'), + ("shy", ''), + ("sigma", 'σ'), + ("sigmaf", 'ς'), + ("sim", '∼'), + ("sp", ' '), + ("spades", '♠'), + ("sub", '⊂'), + ("sube", '⊆'), + ("sum", '∑'), + ("sup", '⊃'), + ("sup1", '¹'), + ("sup2", '²'), + ("sup3", '³'), + ("supe", '⊇'), + ("szlig", 'ß'), + ("tau", 'τ'), + ("there4", '∴'), + ("theta", 'θ'), + ("thetasym", 'ϑ'), + ("thinsp", ' '), + ("thorn", 'þ'), + ("tilde", '˜'), + ("times", '×'), + ("trade", '™'), + ("uArr", '⇑'), + ("uacute", 'ú'), + ("uarr", '↑'), + ("ucirc", 'û'), + ("ugrave", 'ù'), + ("uml", '¨'), + ("upsih", 'ϒ'), + ("upsilon", 'υ'), + ("uuml", 'ü'), + ("varepsilon", '∈'), + ("varphi", 'ϕ'), + ("varpi", 'ϖ'), + ("varrho", 'ϱ'), + ("vdots", '⋮'), + ("vsigma", 'ς'), + ("vtheta", 'ϑ'), + ("weierp", '℘'), + ("xi", 'ξ'), + ("yacute", 'ý'), + ("yen", '¥'), + ("yuml", 'ÿ'), + ("zeta", 'ζ'), + ("zwj", ''), + ("zwnj", ''), +}; + +# Characters Winstart..Winend are those that Windows +# uses interpolated into the 
Latin1 set. +# They aren't supposed to appear in HTML, but they do.... +Winstart : con 16r7f; +Winend: con 16r9f; +winchars := array[] of { '•', + '•', '•', '‚', 'ƒ', '„', '…', '†', '‡', + 'ˆ', '‰', 'Š', '‹', 'Œ', '•', '•', '•', + '•', '‘', '’', '“', '”', '•', '–', '—', + '˜', '™', 'š', '›', 'œ', '•', '•', 'Ÿ' +}; + +NAMCHAR : con (C->L|C->U|C->D|C->N); +LETTER : con (C->L|C->U); + +dbg := 0; +warn := 0; + +init(cu: CharonUtils) +{ + CU = cu; + sys = load Sys Sys->PATH; + S = load String String->PATH; + C = cu->C; + J = cu->J; + T = load StringIntTab StringIntTab->PATH; + tagtable = CU->makestrinttab(tagnames); + attrtable = CU->makestrinttab(attrnames); + ctype = C->ctype; +} + +TokenSource.new(b: ref CU->ByteSource, chset : Btos, mtype: int) : ref TokenSource +{ + ts := ref TSstate ( + 0, # bi + 0, # prevbi + "", # s + 0, # si + Convcs->Startstate, # state + Convcs->Startstate # prevstate + ); + ans := ref TokenSource( + b, # b + chset, # chset + ts, # state + mtype, # mtype + 0 # inxmp + ); + dbg = int (CU->config).dbg['x']; + warn = (int (CU->config).dbg['w']) || dbg; + return ans; +} + +TokenSource.gettoks(ts: self ref TokenSource): array of ref Token +{ + ToksMax : con 500; # max chunk of tokens returned + a := array[ToksMax] of ref Token; + ai := 0; + pcdai := 0; + lim := 0; + # put some dbg output in here + if(ts.mtype == CU->TextHtml) { + pcdstate : ref TSstate; +gather: + while(ai < ToksMax-1) { # always allow space for a Data token + state := getstate(ts); + c := getchar(ts); + if (c < ' ') { + c = eatctls(c, ts); + if (c < 0) + break; + } + tok : ref Token; + if(c == '<') { + tok = gettag(ts); + if (tok != nil && ts.inxmp && tok.tag != Txmp+RBRA) { + rewind(ts, state); + getchar(ts); # consume the '<' + tok = ref Token(Data, "<", nil); + } + if(tok != nil && tok.tag != Comment) { + a[ai++] = tok; + case (tok.tag) { + Tselect or Ttitle or Toption=> + # Several tags expect PCDATA after them. 
+ # Capture state so we can rewind if necessary + pcdstate = state; + pcdai = ai-1; + Ttextarea => + pcdstate = state; + pcdai = ai-1; + # not sure if we should parse entity references + tok = gettagdata(ts, tok.tag, 1); + if(tok != nil) { + pcdstate = nil; + a[ai++] = tok; + } + Tscript => + pcdstate = state; + pcdai = ai-1; + # special rules for getting Data + tok = getscriptdata(ts); + if(tok != nil) { + pcdstate = nil; + a[ai++] = tok; + } + Txmp => + pcdstate = nil; + ts.inxmp = 1; + Txmp+RBRA => + pcdstate = nil; + ts.inxmp = 0; + Data => + ; + Tmeta => + pcdstate = nil; + break gather; + * => + pcdstate = nil; + } + } + } else { + tok = getdata(ts, c); + if(tok != nil) + a[ai++] = tok; + } + if(tok == nil && !eof(ts)) { + # we need more input to complete the token + lim = ts.state.bi; + rewind(ts, state); + break gather; + } else + if(dbg > 1) + sys->print("lex: got token %s\n", tok.tostring()); + } + # Several tags expect PCDATA after them. + # which means that build needs to see another tag or eof + # after any data in order to know that PCDATA is ended. + # Rewind if we haven't got to the following tag yet. 
+ if (pcdstate != nil && !eof(ts)) { + rewind(ts, pcdstate); + ai = pcdai; + } + } + else { + # plain text (non-html) tokens + while(ai < ToksMax) { + tok := getplaindata(ts); + if(tok == nil) + break; + else + a[ai++] = tok; + if(dbg > 1) + sys->print("lex: got token %s\n", tok.tostring()); + } + } + if(dbg) + sys->print("lex: returning %d tokens\n", ai); + if (lim > ts.b.lim) + ts.b.lim = lim; + else + ts.b.lim = ts.state.prevbi; + if(ai == 0) + return nil; + return a[0:ai]; +} + +# must not be called from within TokenSource.gettoks() +# as it will not work with rewind() and ungetchar() +# +TokenSource.setchset(ts: self ref TokenSource, chset: Btos) +{ + st := ts.state; + nchars := st.si; + if (nchars > 0 && nchars < len st.s) { + # align bi to the current input char + bs := ts.b; + (state, nil, n) := ts.chset->btos(st.prevcsstate, bs.data[st.prevbi:st.bi], nchars); + st.bi = st.prevbi + n; + st.prevbi = st.bi; + } + ts.chset = chset; + st.csstate = st.prevcsstate = Convcs->Startstate; + st.s = nil; + st.si = 0; +} + + +eof(ts : ref TokenSource) : int +{ + st := ts.state; + bs := ts.b; + return (st.s == nil && bs.eof && st.prevbi == bs.edata); +} + +# For case where source isn't HTML. 
# Just make data tokens, one per line (or partial line,
# at end of buffer), ignoring non-whitespace control
# characters and dumping \r's.
# Returns nil when no character could be collected.
getplaindata(ts: ref TokenSource): ref Token
{
	line := "";
	n := 0;

	ch := getchar(ts);
	while(ch >= 0) {
		if(ch < ' ') {
			if(ctype[ch] != C->W)
				ch = 0;		# non-whitespace control: ignore
			else if(ch == '\r') {
				# "\r\n" collapses to '\n'; a lone '\r' is treated like '\n'
				nxt := getchar(ts);
				if(nxt != '\n' && nxt >= 0)
					ungetchar(ts);
				ch = '\n';
			}
		}
		if(ch != 0)
			line[n++] = ch;
		if(ch == '\n')
			break;
		ch = getchar(ts);
	}
	if(n == 0)
		return nil;
	return ref Token(Data, line, nil);
}

# Starting with control character c, discard non-whitespace control
# characters and return the first character that is to be kept.
# A '\r' followed by a character other than '\n' is returned as '\n' (that
# character is pushed back); when '\r' is followed by '\n' or by the end of
# the available input, the result of that following getchar() is returned.
# Returns a negative value when the available input runs out.
eatctls(c: int, ts: ref TokenSource): int
{
	for(; c >= 0; c = getchar(ts)) {
		if(c >= ' ')
			return c;
		if(ctype[c] != C->W)
			continue;	# non-whitespace control: skip it
		if(c != '\r')
			return c;
		c = getchar(ts);
		if(c >= 0 && c != '\n') {
			ungetchar(ts);
			return '\n';
		}
		return c;
	}
	return -1;
}

# Gather data up to next start-of-tag or end-of-buffer.
# Translate entity references (&amp;) if not in <XMP> section.
# Ignore non-whitespace control characters and get rid of \r's.
# firstc is the first data character, already read by the caller.
# Returns nil if nothing was gathered.
getdata(ts: ref TokenSource, firstc : int): ref Token
{
	text := "";
	n := 0;

	for(ch := firstc; ch >= 0; ch = getchar(ts)) {
		if(ch < ' ') {
			ch = eatctls(ch, ts);
			if(ch < 0)
				break;
		}
		if(ch == '<') {
			# leave the '<' for the caller
			ungetchar(ts);
			break;
		}
		if(ch == '&' && !ts.inxmp) {
			complete : int;
			(ch, complete) = ampersand(ts);
			if(!complete) {
				# incomplete entity reference: ampersand() backed up to just
				# after the '&'; push the '&' back as well
				ungetchar(ts);
				break;
			}
		}
		if(ch != 0)
			text[n++] = ch;
	}
	if(n == 0)
		return nil;
	return ref Token(Data, text, nil);
}

# The rules for lexing scripts are different (ugh).
# Gather up everything until see a </SCRIPT>.
getscriptdata(ts: ref TokenSource): ref Token
{
	body := gettagdata(ts, Tscript, 0);
	if(body == nil)
		return nil;
	body.text = CU->stripscript(body.text);
	return body;
}

# Collect raw text up to, but not including, the end tag matching 'tag'.
# Any other '<' is kept as ordinary data.  Entity references are
# translated only when doentities is non-zero.
# Returns a Data token when the end tag is seen or the source is at eof;
# returns nil if the available input runs out first.
gettagdata(ts: ref TokenSource, tag, doentities: int): ref Token
{
	endtag := tag+RBRA;
	text := "";
	n := 0;

	for(ch := getchar(ts); ch >= 0; ch = getchar(ts)) {
		if(ch == '<') {
			# trial-lex the tag, then return to just after the '<'
			mark := getstate(ts);
			t := gettag(ts);
			rewind(ts, mark);
			if(t != nil && t.tag == endtag) {
				ungetchar(ts);	# leave the '<' of the end tag unread
				return ref Token(Data, text, nil);
			}
			# tag was not </tag>, take as regular data
		}
		else if(doentities && ch == '&') {
			(ch, nil) = ampersand(ts);
			if(ch < 0)
				break;
		}
		if(ch != 0)
			text[n++] = ch;
	}
	if(!eof(ts))
		return nil;
	return ref Token(Data, text, nil);
}

# We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
# ends before then, return nil).
# If it's a tag, look up the name, gather the attributes, and return
# the appropriate token.
# Else it's either just plain data or some kind of ignorable stuff:
# return a Data or Comment token as appropriate.
gettag(ts: ref TokenSource): ref Token
{
	rbra := 0;
	ans : ref Token = nil;
	al: list of Attr;
	start := getstate(ts);	# just after the '<'; used to back out
	c := getchar(ts);

	# dummy loop: break out of this when hit end of buffer
eob:
	for(;;) {
		if(c == '/') {
			rbra = RBRA;
			c = getchar(ts);
		}
		if(c < 0)
			break eob;
		if(c>=C->NCTYPE || !int (ctype[c]&LETTER)) {
			# not a tag
			if(c == '!') {
				ans = comment(ts);
				if(ans != nil)
					return ans;
				break eob;
			}
			else {
				rewind(ts, start);
				return ref Token(Data, "<", nil);
			}
		}
		# c starts a tagname
		ans = ref Token(Notfound, nil, nil);
		name := "";
		name[0] = lowerc(c);
		i := 1;
		for(;;) {
			c = getchar(ts);
			if(c < 0)
				break eob;
			if(c>=C->NCTYPE || !int (ctype[c]&NAMCHAR))
				break;
			name[i++] = lowerc(c);
		}
		(fnd, tag) := T->lookup(tagtable, name);
		if(fnd)
			ans.tag = tag+rbra;
		else
			ans.text = name;	# for warning print, in build
attrloop:
		for(;;) {
			# look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
			# skip whitespace
			while(c < C->NCTYPE && ctype[c] == C->W) {
				c = getchar(ts);
				if(c < 0)
					break eob;
			}
			if(c == '>')
				break attrloop;
			if(c == '<') {
				if(warn)
					sys->print("warning: unclosed tag; last name=%s\n", name);
				ungetchar(ts);
				break attrloop;
			}
			if(c >= C->NCTYPE || !int (ctype[c]&LETTER)) {
				if(warn)
					sys->print("warning: expected attribute name; last name=%s\n", name);
				# skip to next attribute name
				for(;;) {
					c = getchar(ts);
					if(c < 0)
						break eob;
					if(c < C->NCTYPE && int (ctype[c]&LETTER))
						continue attrloop;
					if(c == '<') {
						if(warn)
							sys->print("warning: unclosed tag; last name=%s\n", name);
						ungetchar(ts);
						break attrloop;
					}
					if(c == '>')
						break attrloop;
				}
			}
			# gather attribute name
			name = "";
			name[0] = lowerc(c);
			i = 1;
			for(;;) {
				c = getchar(ts);
				if(c < 0)
					break eob;
				if(c >= C->NCTYPE || !int (ctype[c]&NAMCHAR))
					break;
				name[i++] = lowerc(c);
			}
			(afnd, attid) := T->lookup(attrtable, name);
			if(warn && !afnd)
				sys->print("warning: unknown attribute name %s\n", name);
			# skip whitespace
			while(c < C->NCTYPE && ctype[c] == C->W) {
				c = getchar(ts);
				if(c < 0)
					break eob;
			}
			if(c != '=') {
				# no value for this attr
				if(afnd)
					al = (attid, "") :: al;
				continue attrloop;
			}
			# c is '=' here;  skip whitespace
			for(;;) {
				c = getchar(ts);
				if(c < 0)
					break eob;
				if(c >= C->NCTYPE || ctype[c] != C->W)
					break;
			}
			# gather value
			quote := 0;
			if(c == '\'' || c == '"') {
				quote = c;
				c = getchar(ts);
				if(c < 0)
					break eob;
			}
			val := "";
			nv := 0;
		valloop:
			for(;;) {
				if(c < 0)
					break eob;
# other browsers allow value strings to be broken across lines
# especially the case for Javascript event handlers / URLs
				if (c == '>' && !quote)
					break valloop;
# old code otherwise ok - keep for now for reference
#				if(c == '>') {
#					if(quote) {
#						# c might be part of string (though not good style)
#						# but if line ends before close quote, assume
#						# there was an unmatched quote
#						ti := ts.i;
#						for(;;) {
#							c = getchar(ts);
#							if(c < 0)
#								break eob;
#							if(c == quote) {
#								backup(ts, ti);
#								val[nv++] = '>';
#								c = getchar(ts);
#								continue valloop;
#							}
#							if(c == '\n') {
#								if(warn)
#									sys->print("warning: apparent unmatched quote\n");
#								backup(ts, ti);
#								quote = 0;
#								c = '>';
#								break valloop;
#							}
#						}
#					}
#					else
#						break valloop;
#				}
				if(quote) {
					if(c == quote) {
						c = getchar(ts);
						if(c < 0)
							break eob;
						break valloop;
					}
					if(c == '\r') {
						# drop \r inside a quoted value
						c = getchar(ts);
						continue valloop;
					}
					if(c == '\t' || c == '\n')
						c = ' ';
				}
				else {
					# an unquoted value ends at whitespace
					if(c < C->NCTYPE && ctype[c]==C->W)
						break valloop;
				}
				if(c == '&') {
					ok : int;
					(c, ok) = ampersand(ts);
					if(!ok)
						break eob;
				}
				val[nv++] = c;
				c = getchar(ts);
			}
			if(afnd)
				al = (attid, val) :: al;
		}
		ans.attr = al;
		return ans;
	}
	# Ran out of input in mid-tag.
	if(eof(ts)) {
		if(warn)
			sys->print("warning: incomplete tag at end of page\n");
		rewind(ts, start);
		return ref Token(Data, "<", nil);
	}
	return nil;
}


# We've just read a '<!',
# so this may be a comment or other ignored section, or it may
# be just a literal string if there is no close before end of file
# (other browsers do that).
# The accepted practice seems to be (note: contrary to SGML spec!):
# If see <!--, look for --> to close, or if none, > to close.
# If see <!(not --), look for > to close.
# If no close before end of file, leave original characters in as literal data.
#
# The fallback from --> to > is only taken once the source is at end of
# file: while more input may still arrive, a missing --> just means the
# comment is not complete yet, and closing it at the first '>' would turn
# the rest of the comment into data.
#
# If we see ignorable stuff, return Comment token.
# Else return nil (caller should back up and try again when more data arrives,
# unless at end of file, in which case caller should just make '<' a data token).
comment(ts: ref TokenSource) : ref Token
{
	havecomment := 0;
	c := getchar(ts);
	if(c == '-') {
		state := getstate(ts);
		c = getchar(ts);
		if(c == '-') {
			if(findstr(ts, "-->"))
				havecomment = 1;
			else {
				if(!eof(ts))
					return nil;	# the --> may be in input not yet received
				rewind(ts, state);
			}
		}
	}
	if(!havecomment) {
		if(c == '>')
			havecomment = 1;
		else if(c >= 0) {
			if(findstr(ts, ">"))
				havecomment = 1;
		}
	}
	if(havecomment)
		return ref Token(Comment, nil, nil);
	return nil;
}

# Look for string s in token source.
# If found, return 1, with buffer at next char after s,
# else return 0 (caller should back up).
findstr(ts: ref TokenSource, s: string) : int
{
	n := len s;
	eix := n-1;
	buf := "";
	c : int;

	if (n == 1) {
		# single character: plain scan
		while ((c = getchar(ts)) >= 0)
			if (c == s[0])
				return 1;
		return 0;
	}

	# prime a window holding the first n characters
	for (i := 0; i < n; i++) {
		c = getchar(ts);
		if (c < 0)
			return 0;
		buf[i] = c;
	}

	for (;;) {
		# this could be much more efficient by tracking
		# the start char through buf
		if (buf == s)
			return 1;
		c = getchar(ts);
		if (c < 0)
			return 0;
		# slide the window along by one character
		buf = buf[1:];
		buf[eix] = c;
	}
	return 0;		# keep the compiler quiet
}

# We've just read an '&'; look for an entity reference
# name, and if found, return (translated char, 1).
# Otherwise the input stream is rewound to just after
# the '&':
# if the reference is complete but unknown or malformed,
# ('&', 1) is returned; if the available input ends in the middle
# of the reference and the source is not at eof, (0, 0) is returned
# so that the caller can retry when more data has arrived.
# A terminating ';', '\n' or '\r' is consumed with the reference; any
# other terminator (including '<') is left in the input.
ampersand(ts: ref TokenSource): (int, int)
{
	Maxrune : con 16r10FFFF;	# largest Unicode code point

	state := getstate(ts);
	c := getchar(ts);
	fnd := 0;
	ans := 0;
	if(c == '#') {
		# numeric reference: &#ddd; or &#xhhh;
		v := 0;
		c = getchar(ts);
		if (c == 'x' || c == 'X') {
			for (c = getchar(ts); c >= 0; c = getchar(ts)) {
				d := -1;
				# ctype[] only covers C->NCTYPE characters
				if (c < C->NCTYPE && int (ctype[c] & C->D))
					d = c-'0';
				else if (c >= 'a' && c <= 'f')
					d = 10 + c-'a';
				else if (c >= 'A' && c <= 'F')
					d = 10 + c-'A';
				if (d < 0)
					break;
				# stop accumulating once out of range, so v cannot overflow
				if (v <= Maxrune)
					v = v*16 + d;
			}
		} else {
			while(c >= 0) {
				if(c >= C->NCTYPE || ctype[c] != C->D)
					break;
				if (v <= Maxrune)
					v = v*10 + c-'0';
				c = getchar(ts);
			}
		}
		if (v > Maxrune)
			v = 0;	# not a character: treat the reference as unknown
		# at end of file the reference is complete even without a terminator
		if(c >= 0 || eof(ts)) {
			if(c >= 0 && !(c == ';' || c == '\n' || c == '\r'))
				ungetchar(ts);	# terminator (e.g. '<') belongs to what follows
			c = v;
			if(c==160)
				c = ' ';	# non-breaking space
			if(c >= Winstart && c <= Winend)
				c = winchars[c-Winstart];
			ans = c;
			fnd = (v != 0);
			# make the "incomplete" test below see a completed reference
			if(c < 0)
				c = 0;
		}
	}
	# only US-ASCII chars can make up &charnames;
	else if(c >= 0 && c < 16r80 && int (ctype[c] & LETTER)) {
		s := "";
		s[0] = c;
		k := 1;
		for(;;) {
			c = getchar(ts);
			if(c < 0)
				break;
			if(c < 16r80 && int (ctype[c]&NAMCHAR))
				s[k++] = c;
			else {
				if(!(c == ';' || c == '\n' || c == '\r'))
					ungetchar(ts);
				break;
			}
		}
		# When the input merely ran out (c < 0 but not eof) the name may
		# continue in the next buffer, so don't look it up yet.
		if ((c < 0 && eof(ts)) || c == ' ' || c == ';' || c == '\n' || c == '\r' || c == '<')
			(fnd, ans) = T->lookup(chartab, s);
	}
	if(!fnd) {
		if(c < 0 && !eof(ts)) {
			# was incomplete
			rewind(ts, state);
			return (0, 0);
		}
		else {
			rewind(ts, state);
			return ('&', 1);
		}
	}
	# eliding soft hyphens (&shy; / &#xAD;) here was considered but is
	# not sufficient - it would need to be done for all input in getdata(),
	# which is too heavy handed
	return (ans, 1);
}

# If c is an uppercase letter, return its lowercase version,
# otherwise return c.
# Characters outside the range covered by ctype[] are returned unchanged,
# so this is safe to call on any character.
lowerc(c: int) : int
{
	if(c >= 0 && c < len ctype && ctype[c] == C->U) {
		# this works for accented characters in Latin1, too
		return c + 16r20;
	}
	return c;
}

# Look up attribute attid on token t.
# Returns (1, value) for the first matching entry in t.attr,
# or (0, "") if the token has no such attribute.
Token.aval(t: self ref Token, attid: int): (int, string)
{
	attr := t.attr;
	while(attr != nil) {
		a := hd attr;
		if(a.attid == attid)
			return (1, a.value);
		attr = tl attr;
	}
	return (0, "");
}


# for debugging
# Data tokens print as 'text'; tags print as <NAME attr='value' ...>,
# with a leading '/' for end tags and '?' as the name of unknown tags.
Token.tostring(t: self ref Token) : string
{
	ans := "";
	tag := t.tag;
	if(tag == Data)
		ans = ans + "'" + t.text + "'";
	else {
		ans = ans + "<";
		if(tag >= RBRA) {
			tag -= RBRA;
			ans = ans + "/";
		}
		tname := tagnames[tag];
		if(tag == Notfound)
			tname = "?";
		ans = ans + S->toupper(tname);
		for(al := t.attr; al != nil; al = tl al) {
			a := hd al;
			aname := attrnames[a.attid];
			ans = ans + " " + aname;
			if(a.value != "")
				ans = ans + "='" + a.value + "'";
		}
		ans = ans + ">";
	}
	return ans;
}


CONVBLK : con 1024;		# number of characters to convert at a time

# Returns -1 if no complete character left before current end of data.
getchar(ts: ref TokenSource): int
{
	cur := ts.state;

	# fast path: characters remain in the converted block
	if (cur.s != nil && cur.si < len cur.s)
		return cur.s[cur.si++];

	# Block exhausted: remember where this conversion starts (for
	# setchset() and eof()) and convert the next run of bytes.
	src := ts.b;
	cur.si = 0;
	cur.s = "";
	cur.prevcsstate = cur.csstate;
	cur.prevbi = cur.bi;
	limit := src.edata;
	if (cur.bi >= limit)
		return -1;
	(nextcs, text, used) := ts.chset->btos(cur.csstate, src.data[cur.bi:limit], CONVBLK);
	if (text == nil) {
		if (src.eof && limit == src.edata) {
			# must have been an encoding error at eof
			cur.prevbi = cur.bi = limit;
		}
		return -1;
	}
	cur.csstate = nextcs;
	cur.s = text;
	cur.bi += used;
	return cur.s[cur.si++];
}

# Push the most recently read character back onto the input.
# NOTE: only valid directly after a successful getchar() call;
# raises "EXInternal:too many backups" if there is nothing to back up over.
ungetchar(ts : ref TokenSource)
{
	cur := ts.state;
	# assert(len cur.s >= 1 && cur.si > 0)
	if (cur.si > 0) {
		cur.si--;
		return;
	}
	raise "EXInternal:too many backups";
}

# Restore a lexer position previously captured with getstate().
rewind(ts : ref TokenSource, state : ref TSstate)
{
	ts.state = state;
}

# Capture the current lexer position: returns a copy of the
# TokenSource state, suitable for passing to rewind().
getstate(ts : ref TokenSource) : ref TSstate
{
	snapshot := ref *ts.state;
	return snapshot;
}

