diff options
Diffstat (limited to 'appl/cmd/webgrab.b')
| -rw-r--r-- | appl/cmd/webgrab.b | 532 |
1 files changed, 532 insertions, 0 deletions
diff --git a/appl/cmd/webgrab.b b/appl/cmd/webgrab.b new file mode 100644 index 00000000..0659f398 --- /dev/null +++ b/appl/cmd/webgrab.b @@ -0,0 +1,532 @@ +# Webgrab -- for getting html pages and the subordinate files (images, frame children) +# they refer to (using "src=..." in a tag) into the local file space. +# Assume http: scheme if none specified. +# Usage: +# webgrab [-r] [-v] [-o stem] url +# If stem is specified, file will be saved in stem.html and images will +# go in stem_1.jpg (or .gif, ...), stem_2.jpg, etc. +# If stem is not specified, derive it from url (see getstem comment, below). +# If -r is specified, get "raw", i.e., no image fetching/html munging. +# If -v is specified (verbose), print some progress information, +# with more if -vv is given. + +implement Webgrab; + +include "sys.m"; + sys: Sys; + FD: import sys; + +include "draw.m"; + +include "string.m"; + S: String; + +include "url.m"; + U: Url; + ParsedUrl: import U; + +include "daytime.m"; + DT: Daytime; + +include "bufio.m"; + B: Bufio; + +include "arg.m"; + +Webgrab: module +{ + init: fn(ctxt: ref Draw->Context, args: list of string); +}; + +stderr: ref FD; +verbose := 0; + +httpproxy: ref Url->ParsedUrl; +noproxydoms: list of string; # domains that don't require proxy + +init(nil: ref Draw->Context, args: list of string) +{ + sys = load Sys Sys->PATH; + stderr = sys->fildes(2); + S = load String String->PATH; + U = load Url Url->PATH; + DT = load Daytime Daytime->PATH; + B = load Bufio Bufio->PATH; + arg := load Arg Arg->PATH; + if(S == nil || U == nil || DT == nil || B == nil || arg == nil) + error_exit("can't load a module"); + U->init(); + stem := ""; + rawflag := 0; + arg->init(args); + arg->setusage("webgrab [-r] [-v[v]] [-o stem] url"); + url := ""; + while((o := arg->opt()) != 0) + case o { + 'r' => + rawflag = 1; + 'v' => + verbose++; + 'o' => + stem = arg->earg(); + * => + arg->usage(); + } + args = arg->argv(); + if(len args != 1) + arg->usage(); + url = hd args; + arg = nil; + (nil,xr) := S->splitstrl(url,"//"); + (nil,yr) := S->splitl(url,":"); + if(xr == "" && yr == "") + url = "http://" + url; + u := U->makeurl(url); + if(stem == "") + stem = getstem(u); + readconfig(); + grab(u, stem, rawflag); +} + +readconfig() +{ + cfgio := B->open("/services/webget/config", sys->OREAD); + if(cfgio != nil) { + for(;;) { + line := B->cfgio.gets('\n'); + if(line == "") { + B->cfgio.close(); + break; + } + if(line[0]=='#') + continue; + (key, val) := S->splitl(line, " \t="); + val = S->take(S->drop(val, " \t="), "^\r\n"); + if(val == "") + continue; + case key { + "httpproxy" => + if(val == "none") + continue; + # val should be host or host:port + httpproxy = U->makeurl("http://" + val); + if(verbose) + sys->fprint(stderr, "Using http proxy %s\n", httpproxy.tostring()); + "noproxy" or + "noproxydoms" => + (nil, noproxydoms) = sys->tokenize(val, ";, \t"); + } + } + } +} + +# Make up a stem for forming save-file-names, based on url u. +# Use the last non-nil component of u.path, without a final extension, +# else use the host. Then, if the stem still contains a '.' (e.g., www.lucent) +# use the part after the final '.'. +# Finally, if all else fails, use use "grabout". +getstem(u: ref ParsedUrl) : string +{ + stem := ""; + if(u.path != "") { + (l, r) := S->splitr(u.path, "/"); + if(r == "") { + # path ended with '/'; try next to last component + if(l != "") + (l, r) = S->splitr(l[0:len l - 1], "/"); + } + if(r != "") + stem = r; + } + if(stem == "") + stem = u.host; + if(stem != "") { + ext: string; + (stem, ext) = S->splitr(stem, "."); + if(stem == "") + stem = ext; + else + stem = stem[0:len stem - 1]; + (nil, stem) = S->splitr(stem, "."); + } + if(stem == "") + stem = "grabout"; + return stem; +} + +grab(u: ref ParsedUrl, stem: string, rawflag: int) +{ + (err, contents, fd, actual) := httpget(u); + if(err != "") + error_exit(err); + ish := is_html(contents); + if(ish) + contents = addfetchcomment(contents, u, actual); + if(rawflag || !ish) { + writebytes(stem, contents, fd); + return; + } + # get subordinates, modify contents + subs : list of (string, string); + (contents, subs) = subfix(contents, stem); + writebytes(stem + ".html", contents, fd); + for(l := subs; l != nil; l = tl l) { + (fname, suburl) := hd l; + subu := U->makeurl(suburl); + subu.makeabsolute(actual); + (suberr, subcontents, subfd, subactual) := httpget(subu); + if(suberr != "") { + sys->fprint(stderr, "webgrab: can't fetch subordinate %s from %s: %s\n", fname, subu.tostring(), suberr); + continue; + } + writebytes(fname, subcontents, subfd); + } +} + +# Fix the html in array a so that referenced subordinate files (SRC= or BACKGROUND= fields of tags) +# are replaced with local names (stem_1.xxx, stem_2.xxx, etc.), +# and return the fixed array along with a list of (local name, subordinate url) +# of images to be fetched. +subfix(a: array of byte, stem: string) : (array of byte, list of (string, string)) +{ + alen := len a; + if(alen == 0) + return (a, nil); + nsubs := 0; + newa := array[alen + 1000] of byte; + newai := 0; + j := 0; + intag := 0; + incom := 0; + quote := 0; + subs : list of (string, string) = nil; + for(i := 0; i < alen; i++) { + c := int a[i]; + if(incom) { + if(amatch(a, i, alen, "-->")) { + incom = 0; + i = i+2; + } + } + else if(intag) { + if(quote==0 && (amatch(a, i, alen, "src") || amatch(a, i, alen, "background"))) { + v := ""; + eqi := 0; + if(amatch(a, i, alen, "src")) + k := i+3; + else + k = i+10; + for(; k < alen; k++) + if(!iswhite(int a[k])) + break; + if(k < alen && int a[k] == '=') { + eqi = k; + k++; + while(k<alen && iswhite(int a[k])) + k++; + if(k<alen) { + kstart := k; + c = int a[k]; + if(c == '\'' || c== '"') { + quote = int a[k++]; + while(k<alen && (int a[k])!=quote) + k++; + v = string a[kstart+1:k]; + k++; + } + else { + while(k<alen && !iswhite(int a[k]) && int a[k] != '>') + k++; + v = string a[kstart:k]; + } + } + } + if(v != "") { + f := ""; + for(l := subs; l != nil; l = tl l) { + (ff,uu) := hd l; + if(v == uu) { + f = ff; + break; + } + } + if(f == "") { + nsubs++; + f = stem + "_" + string nsubs + getsuff(v); + subs = (f, v) :: subs; + } + # should check for newa too small + newa[newai:] = a[j:eqi+1]; + newai += eqi+1-j; + xa := array of byte f; + newa[newai:] = xa; + newai += len xa; + j = k; + } + i = k-1; + } + if(c == '>' && quote == 0) + intag = 0; + if(quote) { + if(quote == c) + quote = 0; + else if(c == '"' || c == '\'') + quote = c; + } + } + else if(c == '<') + intag = 1; + } + if(nsubs == 0) + return (a, nil); + if(i > j) { + newa[newai:] = a[j:i]; + newai += i-j; + } + ans := array[newai] of byte; + ans[0:] = newa[0:newai]; + anssubs : list of (string, string) = nil; + for(ll := subs; ll != nil; ll = tl ll) + anssubs = hd ll :: anssubs; + return (ans, anssubs); +} + +# add c after all f's in a +fixnames(a: array of byte, f: string, c: byte) +{ + alen := len a; + n := alen - len f; + for(i := 0; i < n; i++) { + if(amatch(a, i, alen, f)) { + a[i+len f] = c; + } + } +} + +amatch(a: array of byte, i, alen: int, s: string) : int +{ + slen := len s; + for(k := 0; i+k < alen && k < slen; k++) { + c := int a[i+k]; + if(c >= 'A' && c <= 'Z') + c = c + (int 'a' - int 'A'); + if(c != s[k]) + break; + } + if(k == slen) { + return 1; + } + return 0; +} + +getsuff(ustr: string) : string +{ + u := U->makeurl(ustr); + if(u.path != "") { + for(i := len u.path - 1; i >= 0; i--) { + c := u.path[i]; + if(c == '.') + return u.path[i:]; + if(c == '/') + break; + } + } + return ""; +} + +iswhite(c: int) : int +{ + return (c==' ' || c=='\t' || c=='\n' || c=='\r'); +} + +# Add a comment to end of a giving date and source of fetch +addfetchcomment(a: array of byte, u, actu: ref ParsedUrl) : array of byte +{ + now := DT->text(DT->local(DT->now())); + ustr := u.tostring(); + actustr := actu.tostring(); + comment := "\n<!-- Fetched " + now + " from " + ustr; + if(ustr != actustr) + comment += ", redirected to " + actustr; + comment += " -->\n"; + acom := array of byte comment; + newa := array[len a + len acom] of byte; + newa[0:] = a; + newa[len a:] = acom; + return newa; +} + +# Get u, return (error string, body, actual url of source, after redirection) +httpget(u: ref ParsedUrl) : (string, array of byte, ref Sys->FD, ref ParsedUrl) +{ + ans, body : array of byte; + restfd: ref Sys->FD; + for(redir := 0; redir < 10; redir++) { + if(u.port == "") + u.port = "80"; # default IP port for HTTP + if(verbose) + sys->fprint(stderr, "connecting to %s\n", u.host); + dialhost, port: string; + req := "GET "; + if(httpproxy != nil && need_proxy(u.host)) { + dialhost = httpproxy.host; + port = httpproxy.port; + req += "http://" + u.host; + } + else { + dialhost = u.host; + port = u.port; + } + (ok, net) := sys->dial("tcp!" + dialhost + "!" + port, nil); + if(ok < 0) + return (sys->sprint("can't dial %s: %r", dialhost), nil, nil, nil); + req += "/" + u.path; + if(u.query != "") + req += "?" + u.query; + req += " HTTP/1.0\r\nHost: "+u.host+"\r\nUser-agent: Inferno/webgrab\r\n\r\n"; + if(verbose) + sys->fprint(stderr, "writing request: %s\n", req); + areq := array of byte req; + n := sys->write(net.dfd, areq, len areq); + if(n != len areq) + return (sys->sprint("write problem: %r"), nil, nil, nil); + (ans, restfd) = readbytes(net.dfd); + (status, rest) := stripline(ans); + if(verbose) + sys->fprint(stderr, "response: %s\n", status); + (vers, statusrest) := S->splitl(status, " "); + if(!S->prefix("HTTP/", vers)) + return ("bad reply status: " + status, rest, restfd, nil); + code := int statusrest; + location := ""; + body = rest; + for(;;) { + hline: string; + (hline, body) = stripline(body); + if(hline == "") + break; + if(verbose > 1) + sys->fprint(stderr, "%s\n", hline); + if(!iswhite(hline[0])) { + (hname, hrest) := S->splitl(hline, ":"); + if(hrest != "") { + hname = S->tolower(hname); + hval := S->drop(hrest, ": \t"); + hval = S->take(hval, "^ \t"); + if(hname == "location") + location = hval; + } + } + } + if(code != 200) { + if((code == 300 || code == 301 || code == 302) && location != "") { + # MultipleChoices, MovedPerm, or MovedTemp + if(verbose) + sys->fprint(stderr, "redirect to %s\n", location); + u = U->makeurl(location); + continue; + } + return ("status not ok: " + status, rest, restfd, u); + } + break; + } + return ("", body, restfd, u); +} + +need_proxy(h: string) : int +{ + doml := noproxydoms; + if(doml == nil) + return 1; # all domains need proxy + + lh := len h; + for(dom := hd doml; doml != nil; doml = tl doml) { + ld := len dom; + if(lh >= ld && h[lh-ld:] == dom) + return 0; # domain is on the noproxy list + } + + return 1; +} + +# Simple guess test for HTML: first non-white byte is '<' +is_html(a: array of byte) : int +{ + for(i := 0; i < len a; i++) + if(!iswhite(int a[i])) + break; + if(i < len a && a[i] == byte '<') + return 1; + return 0; +} + +readbytes(fd: ref Sys->FD) : (array of byte, ref Sys->FD) +{ + buf := array[Sys->ATOMICIO] of byte; + i := 0; + avail := len buf; + while (avail > 0) { + n := sys->read(fd, buf[i:], avail); + if(n <= 0) { + fd = nil; + break; + } + i += n; + avail -= n; + } + return (buf[0:i], fd); +} + +writebytes(f: string, a: array of byte, fd: ref Sys->FD) +{ + ofd: ref Sys->FD; + if (f == "-") + ofd = sys->fildes(1); + else + ofd = sys->create(f, Sys->OWRITE, 8r666); + if(ofd == nil) { + sys->fprint(stderr, "webgrab: can't create %s: %r\n", f); + return; + } + i := 0; + clen := len a; + while(i < clen) { + n := sys->write(ofd, a[i:], clen-i); + if(n < 0) { + sys->fprint(stderr, "webgrab: write error: %r\n"); + return; + } + i += n; + } + if(fd != nil) { + buf := array[Sys->ATOMICIO] of byte; + while((n := sys->read(fd, buf, len buf)) > 0) { + if(sys->write(ofd, buf, n) != n) { + sys->fprint(stderr, "webgrab: write error: %r\n"); + return; + } + } + if(n < 0) { + sys->fprint(stderr, "webgrab: read error: %r\n"); + return; + } + clen += n; + } + if (f != "-") + sys->fprint(stderr, "created %s, %d bytes\n", f, clen); +} + +stripline(b: array of byte) : (string, array of byte) +{ + n := len b - 1; + for(i := 0; i < n; i++) + if(b[i] == byte '\r' && b[i+1] == byte '\n') + return (string b[0:i], b[i+2:]); + return ("", b); +} + +error_exit(msg: string) +{ + sys->fprint(sys->fildes(2), "%s\n", msg); + raise "fail:error"; +} |
