summaryrefslogtreecommitdiff
path: root/appl/lib/regex.b
diff options
context:
space:
mode:
authorCharles.Forsyth <devnull@localhost>2006-12-22 17:07:39 +0000
committerCharles.Forsyth <devnull@localhost>2006-12-22 17:07:39 +0000
commit37da2899f40661e3e9631e497da8dc59b971cbd0 (patch)
treecbc6d4680e347d906f5fa7fca73214418741df72 /appl/lib/regex.b
parent54bc8ff236ac10b3eaa928fd6bcfc0cdb2ba46ae (diff)
20060303a
Diffstat (limited to 'appl/lib/regex.b')
-rw-r--r--appl/lib/regex.b389
1 files changed, 389 insertions, 0 deletions
diff --git a/appl/lib/regex.b b/appl/lib/regex.b
new file mode 100644
index 00000000..3e94df3f
--- /dev/null
+++ b/appl/lib/regex.b
@@ -0,0 +1,389 @@
+implement Regex;
+
+include "regex.m";
+
+# syntax
+
+# RE ALT regular expression
+# NUL
+# ALT CAT alternation
+# CAT | ALT
+#
+# CAT DUP catenation
+# DUP CAT
+#
+# DUP PRIM possibly duplicated primary
+# PCLO
+# CLO
+# OPT
+#
+# PCLO PRIM + 1 or more
+# CLO PRIM * 0 or more
+# OPT PRIM ? 0 or 1
+#
+# PRIM ( RE )
+# ()
+# DOT any character
+# CHAR a single character
+# ESC escape sequence
+# [ SET ] character set
+# NUL null string
+# HAT beginning of string
+# DOL end of string
+#
+
+NIL : con -1; # a refRex constant
+NONE: con -2; # ditto, for an un-set value
+BAD: con 1<<16; # a non-character
+HUGE: con (1<<31) - 1;
+
+# the data structures of re.m would like to be ref-linked, but are
+# circular (see fn walk), thus instead of pointers we use indexes
+# into an array (arena) of nodes of the syntax tree of a regular expression.
+# from a storage-allocation standpoint, this replaces many small
+# allocations of one size with one big one of variable size.
+
+ReStr: adt {
+ s : string;
+ i : int; # cursor postion
+ n : int; # number of chars left; -1 on error
+ peek : fn(s: self ref ReStr): int;
+ next : fn(s: self ref ReStr): int;
+};
+
+ReStr.peek(s: self ref ReStr): int
+{
+ if(s.n <= 0)
+ return BAD;
+ return s.s[s.i];
+}
+
+ReStr.next(s: self ref ReStr): int
+{
+ if(s.n <= 0)
+ return BAD;
+ s.n--;
+ return s.s[s.i++];
+}
+
+newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int): refRex
+{
+ ar.rex[ar.ptr] = Rex(kind, left, right, set, pno);
+ return ar.ptr++;
+}
+
+# parse a regex by recursive descent to get a syntax tree
+
+re(s: ref ReStr, ar: ref Arena): refRex
+{
+ left := cat(s, ar);
+ if(left==NIL || s.peek()!='|')
+ return left;
+ s.next();
+ right := re(s, ar);
+ if(right == NIL)
+ return NIL;
+ return newRe(ALT, left, right, nil, ar, 0);
+}
+
+cat(s: ref ReStr, ar: ref Arena): refRex
+{
+ left := dup(s, ar);
+ if(left == NIL)
+ return left;
+ right := cat(s, ar);
+ if(right == NIL)
+ return left;
+ return newRe(CAT, left, right, nil, ar, 0);
+}
+
+dup(s: ref ReStr, ar: ref Arena): refRex
+{
+ case s.peek() {
+ BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
+ return NIL;
+ }
+ prim: refRex;
+ case kind:=s.next() {
+ '(' => if(ar.pno < 0) {
+ if(s.peek() == ')') {
+ s.next();
+ prim = newRe(NUL, NONE, NONE, nil, ar, 0);
+ } else {
+ prim = re(s, ar);
+ if(prim==NIL || s.next()!=')')
+ s.n = -1;
+ }
+ } else {
+ pno := ++ar.pno;
+ lp := newRe(LPN, NONE, NONE, nil, ar, pno);
+ rp := newRe(RPN, NONE, NONE, nil, ar, pno);
+ if(s.peek() == ')') {
+ s.next();
+ prim = newRe(CAT, lp, rp, nil, ar, 0);
+
+ } else {
+ prim = re(s, ar);
+ if(prim==NIL || s.next()!=')')
+ s.n = -1;
+ else {
+ prim = newRe(CAT, prim, rp, nil, ar, 0);
+ prim = newRe(CAT, lp, prim, nil, ar, 0);
+ }
+ }
+ }
+ '[' => prim = newRe(SET, NONE, NONE, newSet(s), ar, 0);
+ * => case kind {
+ '.' => kind = DOT;
+ '^' => kind = HAT;
+ '$' => kind = DOL;
+ }
+ prim = newRe(esc(s, kind), NONE, NONE, nil, ar, 0);
+ }
+ case s.peek() {
+ '*' => kind = CLO;
+ '+' => kind = PCLO;
+ '?' => kind = OPT;
+ * => return prim;
+ }
+ s.next();
+ return newRe(kind, prim, NONE, nil, ar, 0);
+}
+
+esc(s: ref ReStr, char: int): int
+{
+ if(char == '\\') {
+ char = s.next();
+ case char {
+ BAD => s.n = -1;
+ 'n' => char = '\n';
+ }
+ }
+ return char;
+}
+
+# walk the tree adjusting pointers to refer to
+# next state of the finite state machine
+
+walk(r: refRex, succ: refRex, ar: ref Arena)
+{
+ if(r==NONE)
+ return;
+ rex := ar.rex[r];
+ case rex.kind {
+ ALT => walk(rex.left, succ, ar);
+ walk(rex.right, succ, ar);
+ return;
+ CAT => walk(rex.left, rex.right, ar);
+ walk(rex.right, succ, ar);
+ ar.rex[r] = ar.rex[rex.left]; # optimization
+ return;
+ CLO or PCLO =>
+ end := newRe(OPT, r, succ, nil, ar, 0); # here's the circularity
+ walk(rex.left, end, ar);
+ OPT => walk(rex.left, succ, ar);
+ }
+ ar.rex[r].right = succ;
+}
+
+compile(e: string, flag: int): (Re, string)
+{
+ if(e == nil)
+ return (nil, "missing expression");
+ s := ref ReStr(e, 0, len e);
+ ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
+ start := ar.start = re(s, ar);
+ if(start==NIL || s.n!=0)
+ return (nil, "invalid regular expression");
+ walk(start, NIL, ar);
+ if(ar.pno < 0)
+ ar.pno = 0;
+ return (ar, nil);
+}
+
+# todo1, todo2: queues for epsilon and advancing transitions
+Gaz: adt {
+ pno: int;
+ beg: int;
+ end: int;
+};
+Trace: adt {
+ cre: refRex; # cursor in Re
+ beg: int; # where this trace began;
+ gaz: list of Gaz;
+};
+Queue: adt {
+ ptr: int;
+ q: array of Trace;
+};
+
+execute(re: Re, s: string): array of (int, int)
+{
+ return executese(re, s, (-1,-1), 1, 1);
+}
+
+executese(re: Re, s: string, range: (int, int), bol: int, eol: int): array of (int,int)
+{
+ if(re==nil)
+ return nil;
+ (s0, s1) := range;
+ if(s0 < 0)
+ s0 = 0;
+ if(s1 < 0)
+ s1 = len s;
+ gaz : list of Gaz;
+ (beg, end) := (-1, -1);
+ todo1 := ref Queue(0, array[re.ptr] of Trace);
+ todo2 := ref Queue(0, array[re.ptr] of Trace);
+ for(i:=s0; i<=s1; i++) {
+ small2 := HUGE; # earliest possible match if advance
+ if(beg == -1) # no leftmost match yet
+ todo1.q[todo1.ptr++] = Trace(re.start, i, nil);
+ for(k:=0; k<todo1.ptr; k++) {
+ q := todo1.q[k];
+ rex := re.rex[q.cre];
+ next1 := next2 := NONE;
+ case rex.kind {
+ NUL =>
+ next1 = rex.right;
+ DOT =>
+ if(i<len s && s[i]!='\n')
+ next2 = rex.right;
+ HAT =>
+ if(i == s0 && bol)
+ next1 = rex.right;
+ DOL =>
+ if(i == s1 && eol)
+ next1 = rex.right;
+ SET =>
+ if(i<len s && member(s[i], rex.set))
+ next2 = rex.right;
+ CAT or
+ PCLO =>
+ next1 = rex.left;
+ ALT or
+ CLO or
+ OPT =>
+ next1 = rex.right;
+ k = insert(rex.left, q.beg, q.gaz, todo1, k);
+ LPN =>
+ next1 = rex.right;
+ q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
+ RPN =>
+ next1 = rex.right;
+ for(r:=q.gaz; ; r=tl r) {
+ (pno,beg1,end1) := hd r;
+ if(rex.pno==pno && end1==-1) {
+ q.gaz = Gaz(pno,beg1,i)::q.gaz;
+ break;
+ }
+ }
+ * =>
+ if(i<len s && rex.kind==s[i])
+ next2 = rex.right;
+ }
+ if(next1 != NONE) {
+ if(next1 != NIL)
+ k =insert(next1, q.beg, q.gaz, todo1, k);
+ else if(better(q.beg, i, beg, end))
+ (gaz, beg, end) = (q.gaz, q.beg, i);
+ }
+ if(next2 != NONE) {
+ if(next2 != NIL) {
+ if(q.beg < small2)
+ small2 = q.beg;
+ insert(next2, q.beg, q.gaz, todo2, 0);
+ } else if(better(q.beg, i+1, beg, end))
+ (gaz, beg, end) = (q.gaz, q.beg, i+1);
+ }
+
+ }
+ if(beg!=-1 && beg<small2) # nothing better possible
+ break;
+ (todo1,todo2) = (todo2, todo1);
+ todo2.ptr = 0;
+ }
+ if(beg == -1)
+ return nil;
+ result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
+ for( ; gaz!=nil; gaz=tl gaz) {
+ (pno, beg1, end1) := hd gaz;
+ (rbeg, nil) := result[pno];
+ if(rbeg==-1 && (beg1|end1)!=-1)
+ result[pno] = (beg1,end1);
+ }
+ return result;
+}
+
+better(newbeg, newend, oldbeg, oldend: int): int
+{
+ return oldbeg==-1 || newbeg<oldbeg ||
+ newbeg==oldbeg && newend>oldend;
+}
+
+insert(next: refRex, tbeg: int, tgaz: list of Gaz, todo: ref Queue, k: int): int
+{
+ for(j:=0; j<todo.ptr; j++)
+ if(todo.q[j].cre == next)
+ if(todo.q[j].beg <= tbeg)
+ return k;
+ else
+ break;
+ if(j < k)
+ k--;
+ if(j < todo.ptr)
+ todo.ptr--;
+ for( ; j<todo.ptr; j++)
+ todo.q[j] = todo.q[j+1];
+ todo.q[todo.ptr++] = Trace(next, tbeg, tgaz);
+ return k;
+}
+
+ASCII : con 128;
+WORD : con 32;
+
+member(char: int, set: ref Set): int
+{
+ if(char < 128)
+ return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
+ for(l:=set.unicode; l!=nil; l=tl l) {
+ (beg, end) := hd l;
+ if(char>=beg && char<=end)
+ return !set.neg;
+ }
+ return set.neg;
+}
+
+newSet(s: ref ReStr): ref Set
+{
+ set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil);
+ if(s.peek() == '^') {
+ set.neg = 1;
+ s.next();
+ }
+ while(s.n > 0) {
+ char1 := s.next();
+ if(char1 == ']')
+ return set;
+ char1 = esc(s, char1);
+ char2 := char1;
+ if(s.peek() == '-') {
+ s.next();
+ char2 = s.next();
+ if(char2 == ']')
+ break;
+ char2 = esc(s, char2);
+ if(char2 < char1)
+ break;
+ }
+ for( ; char1<=char2; char1++)
+ if(char1 < ASCII)
+ set.ascii[char1/WORD] |= 1<<char1%WORD;
+ else {
+ set.unicode = (char1,char2)::set.unicode;
+ break;
+ }
+ }
+ s.n = -1;
+ return nil;
+}