4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
23 * Beware that some of this code is subtly aware of the way operator
24 * precedence is structured in regular expressions. Serious changes in
25 * regular-expression syntax might require a total rethink.
27 * *** NOTE: this code has been altered slightly for use in Tcl. ***
28 * *** The only change is to use ckalloc and ckfree instead of ***
29 * *** malloc and free. ***
31 * *** and again for Expect!!! - DEL
33 * *** More minor corrections stolen from tcl7.5p1/regexp.c - DEL
38 #include "expect_cf.h"
40 #include "tclRegexp.h"
41 #include "exp_regexp.h"
44 #define NOTSTATIC /* was at one time, but Expect needs access */
47 * The "internal use only" fields in regexp.h are present to pass info from
48 * compile to execute that permits the execute phase to run lots faster on
49 * simple cases. They are:
51 * regstart char that must begin a match; '\0' if none obvious
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
56 * Regstart and reganch permit very fast decisions on suitable starting points
57 * for a match, cutting down the work a lot. Regmust permits fast rejection
58 * of lines that cannot possibly match. The regmust tests are costly enough
59 * that regcomp() supplies a regmust only if the r.e. contains something
60 * potentially expensive (at present, the only such thing detected is * or +
61 * at the start of the r.e., which can involve a lot of backup). Regmlen is
62 * supplied because the test in regexec() needs it and regcomp() is computing
67 * Structure for regexp "program". This is essentially a linear encoding
68 * of a nondeterministic finite-state machine (aka syntax charts or
69 * "railroad normal form" in parsing technology). Each node is an opcode
70 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
71 * all nodes except BRANCH implement concatenation; a "next" pointer with
72 * a BRANCH on both ends of it is connecting two alternatives. (Here we
73 * have one of the subtle syntax dependencies: an individual BRANCH (as
74 * opposed to a collection of them) is never concatenated with anything
75 * because of operator precedence.) The operand of some types of node is
76 * a literal string; for others, it is a node leading into a sub-FSM. In
77 * particular, the operand of a BRANCH node is the first node of the branch.
78 * (NB this is *not* a tree structure: the tail of the branch connects
79 * to the thing following the set of BRANCHes.) The opcodes are:
82 /* definition number opnd? meaning */
83 #define END 0 /* no End of program. */
84 #define BOL 1 /* no Match "" at beginning of line. */
85 #define EOL 2 /* no Match "" at end of line. */
86 #define ANY 3 /* no Match any one character. */
87 #define ANYOF 4 /* str Match any character in this string. */
88 #define ANYBUT 5 /* str Match any character not in this string. */
89 #define BRANCH 6 /* node Match this alternative, or the next... */
90 #define BACK 7 /* no Match "", "next" ptr points backward. */
91 #define EXACTLY 8 /* str Match this string. */
92 #define NOTHING 9 /* no Match empty string. */
93 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
94 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
95 #define OPEN 20 /* no Mark this point in input as start of #n. */
96 /* OPEN+1 is number 1, etc. */
97 #define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */
102 * BRANCH The set of branches constituting a single choice are hooked
103 * together with their "next" pointers, since precedence prevents
104 * anything being concatenated to any individual branch. The
105 * "next" pointer of the last BRANCH in a choice points to the
106 * thing following the whole choice. This is also where the
107 * final "next" pointer of each individual branch points; each
108 * branch starts with the operand node of a BRANCH node.
110 * BACK Normal "next" pointers all implicitly point forward; BACK
111 * exists to make loop structures possible.
113 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
114 * BRANCH structures using BACK. Simple cases (one character
115 * per match) are implemented with STAR and PLUS for speed
116 * and to minimize recursive plunges.
118 * OPEN,CLOSE ...are numbered at compile time.
122 * A node is one char of opcode followed by two chars of "next" pointer.
123 * "Next" pointers are stored as two 8-bit pieces, high order first. The
124 * value is a positive offset from the opcode of the node containing it.
125 * An operand, if any, simply follows the node. (Note that much of the
126 * code generation knows about this implicit relationship.)
128 * Using two bytes for the "next" pointer is vast overkill for most things,
129 * but allows patterns to get big without disasters.
132 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
133 #define OPERAND(p) ((p) + 3)
136 * See regmagic.h for one further detail of program structure.
141 * Utility definitions.
144 #define UCHARAT(p) ((int)*(unsigned char *)(p))
146 #define UCHARAT(p) ((int)*(p)&CHARBITS)
149 #define FAIL(m) { regerror(m); return(NULL); }
150 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
151 #define META "^$.[()|?+*\\"
154 * Flags to be passed up and down.
156 #define HASWIDTH 01 /* Known never to match null string. */
157 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
158 #define SPSTART 04 /* Starts with * or +. */
159 #define WORST 0 /* Worst case. */
162 * Global work variables for regcomp().
164 static char *regparse; /* Input-scan pointer. */
165 static int regnpar; /* () count. */
166 static char regdummy;
167 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
168 static long regsize; /* Code size. */
171 * The first byte of the regexp internal "program" is actually this magic
172 * number; the start node begins in the second byte.
178 * Forward declarations for regcomp()'s friends.
181 #define STATIC static
184 STATIC char *regbranch();
185 STATIC char *regpiece();
186 STATIC char *regatom();
187 STATIC char *regnode();
188 STATIC char *regnext();
190 STATIC void reginsert();
191 STATIC void regtail();
192 STATIC void regoptail();
194 STATIC int strcspn();
197 /* regcomp originally appeared here - DEL */
200 - reg - regular expression, i.e. main body or parenthesized thing
202 * Caller must absorb opening parenthesis.
204 * Combining parenthesis handling with the base level of regular expression
205 * is a trifle forced, but the need to tie the tails of the branches to what
206 * follows makes it hard to avoid.
210 int paren; /* Parenthesized? */
215 register char *ender;
216 register int parno = 0;
219 *flagp = HASWIDTH; /* Tentatively. */
221 /* Make an OPEN node, if parenthesized. */
223 if (regnpar >= NSUBEXP)
227 ret = regnode(OPEN+parno);
231 /* Pick up the branches, linking them together. */
232 br = regbranch(&flags);
236 regtail(ret, br); /* OPEN -> first. */
239 if (!(flags&HASWIDTH))
241 *flagp |= flags&SPSTART;
242 while (*regparse == '|') {
244 br = regbranch(&flags);
247 regtail(ret, br); /* BRANCH -> BRANCH. */
248 if (!(flags&HASWIDTH))
250 *flagp |= flags&SPSTART;
253 /* Make a closing node, and hook it on the end. */
254 ender = regnode((paren) ? CLOSE+parno : END);
257 /* Hook the tails of the branches to the closing node. */
258 for (br = ret; br != NULL; br = regnext(br))
259 regoptail(br, ender);
261 /* Check for proper termination. */
262 if (paren && *regparse++ != ')') {
263 FAIL("unmatched ()");
264 } else if (!paren && *regparse != '\0') {
265 if (*regparse == ')') {
266 FAIL("unmatched ()");
268 FAIL("junk on end"); /* "Can't happen". */
276 - regbranch - one alternative of an | operator
278 * Implements the concatenation operator.
285 register char *chain;
286 register char *latest;
289 *flagp = WORST; /* Tentatively. */
291 ret = regnode(BRANCH);
293 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
294 latest = regpiece(&flags);
297 *flagp |= flags&HASWIDTH;
298 if (chain == NULL) /* First piece. */
299 *flagp |= flags&SPSTART;
301 regtail(chain, latest);
304 if (chain == NULL) /* Loop ran zero times. */
305 (void) regnode(NOTHING);
311 - regpiece - something followed by possible [*+?]
313 * Note that the branching code sequences used for ? and the general cases
314 * of * and + are somewhat optimized: they use the same NOTHING node as
315 * both the endmarker for their branch list and the body of the last branch.
316 * It might seem that this node could be dispensed with entirely, but the
317 * endmarker role is not redundant.
328 ret = regatom(&flags);
338 if (!(flags&HASWIDTH) && op != '?')
339 FAIL("*+ operand could be empty");
340 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
342 if (op == '*' && (flags&SIMPLE))
343 reginsert(STAR, ret);
344 else if (op == '*') {
345 /* Emit x* as (x&|), where & means "self". */
346 reginsert(BRANCH, ret); /* Either x */
347 regoptail(ret, regnode(BACK)); /* and loop */
348 regoptail(ret, ret); /* back */
349 regtail(ret, regnode(BRANCH)); /* or */
350 regtail(ret, regnode(NOTHING)); /* null. */
351 } else if (op == '+' && (flags&SIMPLE))
352 reginsert(PLUS, ret);
353 else if (op == '+') {
354 /* Emit x+ as x(&|), where & means "self". */
355 next = regnode(BRANCH); /* Either */
357 regtail(regnode(BACK), ret); /* loop back */
358 regtail(next, regnode(BRANCH)); /* or */
359 regtail(ret, regnode(NOTHING)); /* null. */
360 } else if (op == '?') {
361 /* Emit x? as (x|) */
362 reginsert(BRANCH, ret); /* Either x */
363 regtail(ret, regnode(BRANCH)); /* or */
364 next = regnode(NOTHING); /* null. */
366 regoptail(ret, next);
369 if (ISMULT(*regparse))
376 - regatom - the lowest level
378 * Optimization: gobbles an entire sequence of ordinary characters so that
379 * it can turn them into a single node, which is smaller to store and
380 * faster to run. Backslashed characters are exceptions, each becoming a
381 * separate node; the code is simpler that way and it's not worth fixing.
390 *flagp = WORST; /* Tentatively. */
392 switch (*regparse++) {
401 *flagp |= HASWIDTH|SIMPLE;
405 register int classend;
407 if (*regparse == '^') { /* Complement of range. */
408 ret = regnode(ANYBUT);
411 ret = regnode(ANYOF);
412 if (*regparse == ']' || *regparse == '-')
414 while (*regparse != '\0' && *regparse != ']') {
415 if (*regparse == '-') {
417 if (*regparse == ']' || *regparse == '\0')
420 clss = UCHARAT(regparse-2)+1;
421 classend = UCHARAT(regparse);
422 if (clss > classend+1)
423 FAIL("invalid [] range");
424 for (; clss <= classend; clss++)
432 if (*regparse != ']')
433 FAIL("unmatched []");
435 *flagp |= HASWIDTH|SIMPLE;
439 ret = reg(1, &flags);
442 *flagp |= flags&(HASWIDTH|SPSTART);
447 FAIL("internal urp"); /* Supposed to be caught earlier. */
453 FAIL("?+* follows nothing");
457 if (*regparse == '\0')
459 ret = regnode(EXACTLY);
462 *flagp |= HASWIDTH|SIMPLE;
469 len = strcspn(regparse, META);
471 FAIL("internal disaster");
472 ender = *(regparse+len);
473 if (len > 1 && ISMULT(ender))
474 len--; /* Back off clear of ?+* operand. */
478 ret = regnode(EXACTLY);
492 - regnode - emit a node
494 static char * /* Location. */
502 if (ret == ®dummy) {
509 *ptr++ = '\0'; /* Null "next" pointer. */
517 - regc - emit (if appropriate) a byte of code
523 if (regcode != ®dummy)
524 *regcode++ = (char)b;
530 - reginsert - insert an operator in front of already-emitted operand
532 * Means relocating the operand.
541 register char *place;
543 if (regcode == ®dummy) {
554 place = opnd; /* Op node, where operand used to be. */
561 - regtail - set the next-pointer at the end of a node chain
575 /* Find last node. */
578 temp = regnext(scan);
584 if (OP(scan) == BACK)
588 *(scan+1) = (char)(offset>>8)&0377;
589 *(scan+2) = (char)offset&0377;
593 - regoptail - regtail on operand of first argument; nop if operandless
600 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
601 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
603 regtail(OPERAND(p), val);
607 * regexec and friends
611 * Global work variables for regexec().
613 static char *reginput; /* String-input pointer. */
614 NOTSTATIC char *regbol; /* Beginning of input, for ^ check. */
615 static char **regstartp; /* Pointer to startp array. */
616 static char **regendp; /* Ditto for endp. */
622 NOTSTATIC int regtry();
623 STATIC int regmatch();
624 STATIC int regrepeat();
629 STATIC char *regprop();
634 - regexec - match a regexp against a string
637 regexec(prog, string, stringlength, matchlength)
638 register regexp *prog;
639 register char *string; /* note: CURRENTLY ASSUMED TO BE NULL-TERMINATED!!! */
640 int stringlength; /* length of string */
641 int *matchlength; /* number of chars matched (or to be skipped) */
642 /* set when MATCH or CANT_MATCH */
645 extern char *strchr();
648 if (prog == NULL || string == NULL) {
649 regerror("NULL parameter");
650 return(EXP_TCLERROR);
653 /* Check validity of program. */
654 if (UCHARAT(prog->program) != MAGIC) {
655 regerror("corrupted program");
656 return(EXP_KM_ERROR);
660 /* no need for this shortcut anyway */
661 /* If there is a "must appear" string, look for it. */
662 if (prog->regmust != NULL) {
664 while ((s = strchr(s, prog->regmust[0])) != NULL) {
665 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
666 break; /* Found it. */
669 if (s == NULL) /* Not present. */
674 /* Mark beginning of line for ^ . */
677 /* Simplest case: anchored match need be tried only once. */
679 int r = regtry(prog,string,matchlength);
680 if (r == CANT_MATCH) *matchlength = stringlength;
684 /* Messy cases: unanchored match. */
686 if (prog->regstart != '\0') {
687 register char *s2 = s;
689 /* We know what char it must start with. */
693 s2 = strchr(s2,prog->regstart);
695 *matchlength = stringlength;
698 r = regtry(prog,s2,matchlength);
699 if (r == CANT_MATCH) {
703 if (s2 == s) return(r);
708 /* We don't -- general case. */
709 register char *s2 = s;
710 int r = regtry(prog,s,matchlength);
711 if (r == EXP_MATCH) return(r);
712 else if (r == EXP_CANMATCH) return(r);
713 /* at this point, we know some characters at front */
714 /* of string don't match */
715 for (s2++;*s2;s2++) {
716 r = regtry(prog,s2,matchlength);
717 if (r == CANT_MATCH) continue;
718 /* if we match or can_match, say cant_match and */
719 /* record the number of chars at front that don't match */
723 /* made it thru string with CANT_MATCH all the way */
724 *matchlength = stringlength;
731 - regtry - try match at specific point
733 /* return CAN_MATCH, CANT_MATCH or MATCH */
734 int /* 0 failure, 1 success */
735 regtry(prog, string, matchlength)
738 int *matchlength; /* only set for MATCH */
743 int r; /* result of regmatch */
746 regstartp = prog->startp;
747 regendp = prog->endp;
751 for (i = NSUBEXP; i > 0; i--) {
755 r = regmatch(prog->program + 1);
756 if (EXP_MATCH == r) {
757 prog->startp[0] = string;
758 prog->endp[0] = reginput;
759 *matchlength = reginput-string;
762 return(r); /* CAN_MATCH or CANT_MATCH */
766 - regmatch - main matching routine
768 * Conceptually the strategy is simple: check to see whether the current
769 * node matches, call self recursively to see whether the rest matches,
770 * and then act accordingly. In practice we make some effort to avoid
771 * recursion, in particular by going through "ordinary" nodes (that don't
772 * need to know whether the rest of the match failed) by a loop instead of
775 /* returns CAN, CANT or MATCH */
776 static int /* 0 failure, 1 success */
780 register char *scan; /* Current node. */
781 char *next; /* Next node. */
782 #ifndef strchr /* May be #defined to something else */
783 extern char *strchr();
788 if (scan != NULL && regnarrate)
789 fprintf(stderr, "%s(\n", regprop(scan));
791 while (scan != NULL) {
794 fprintf(stderr, "%s...\n", regprop(scan));
796 next = regnext(scan);
800 if (reginput != regbol)
802 return(EXP_CANTMATCH);
805 if (*reginput != '\0')
807 /* note this implies that "$" must match everything received to this point! */
808 return(EXP_CANTMATCH);
811 if (*reginput == '\0')
813 return(EXP_CANMATCH);
817 /* register int len;*/
820 opnd = OPERAND(scan);
822 /* this section of code is totally rewritten - DEL */
823 /* group of literal chars in pattern */
824 /* compare each one */
826 if (*opnd != *reginput) {
827 if (*reginput == '\0') {
829 } else return EXP_CANTMATCH;
834 } while (*opnd != '\0');
838 /* if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
841 if (*reginput == '\0')
842 return(EXP_CANMATCH);
843 if (strchr(OPERAND(scan),*reginput) == NULL)
844 return(EXP_CANTMATCH);
848 /* if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
851 if (*reginput == '\0')
852 return(EXP_CANMATCH);
853 if (strchr(OPERAND(scan),*reginput) != NULL)
854 return(EXP_CANTMATCH);
872 int r; /* result of regmatch */
875 no = OP(scan) - OPEN;
879 if (r == EXP_MATCH) {
881 * Don't set startp if some later
882 * invocation of the same parentheses
885 if (regstartp[no] == NULL)
886 regstartp[no] = save;
903 int r; /* result of regmatch */
906 no = OP(scan) - CLOSE;
910 if (r == EXP_MATCH) {
912 * Don't set endp if some later
913 * invocation of the same parentheses
916 if (regendp[no] == NULL)
927 if (OP(next) != BRANCH) /* No choice. */
928 next = OPERAND(scan); /* Avoid recursion. */
930 match_status = EXP_CANTMATCH;
936 r = regmatch(OPERAND(scan));
937 if (r == EXP_MATCH) return(r);
938 if (r == EXP_CANMATCH) {
942 scan = regnext(scan);
943 } while (scan != NULL && OP(scan) == BRANCH);
944 return(match_status);
952 register char nextch;
958 if (*reginput == '\0') return EXP_CANMATCH;
961 * Lookahead to avoid useless match attempts
962 * when we know what character comes next.
964 match_status = EXP_CANTMATCH;
966 if (OP(next) == EXACTLY)
967 nextch = *OPERAND(next);
968 min = (OP(scan) == STAR) ? 0 : 1;
970 no = regrepeat(OPERAND(scan));
972 /* If it could work, try it. */
973 /* 3rd condition allows for CAN_MATCH */
974 if (nextch == '\0' || *reginput == nextch || *reginput == '\0') {
975 int r = regmatch(next);
978 if (r == EXP_CANMATCH)
979 /* match_status = r;*/
980 return(EXP_CANMATCH);
982 /* Couldn't or didn't -- back up. */
984 reginput = save + no;
986 return(match_status);
992 if (*reginput == '\0') {
993 return(EXP_CANMATCH);
997 /* return(EXP_CANMATCH); Success! */
1001 if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
1003 } else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
1006 regerror("memory corruption");
1007 return(EXP_TCLERROR);
1016 * We get here only if there's trouble -- normally "case END" is
1017 * the terminating point.
1019 regerror("corrupted pointers");
1020 return(EXP_TCLERROR);
1024 - regrepeat - repeatedly match something simple, report how many
1030 register int count = 0;
1031 register char *scan;
1032 register char *opnd;
1033 #ifndef strchr /* May be #defined to something else */
1034 /*DEL*/ extern char *strchr();
1041 count = strlen(scan);
1045 while (*opnd == *scan) {
1051 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1057 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1062 default: /* Oh dear. Called inappropriately. */
1063 regerror("internal foulup");
1064 count = 0; /* Best compromise. */
1073 - regnext - dig the "next" pointer out of a node
1079 register int offset;
1096 STATIC char *regprop();
1099 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1106 register char op = EXACTLY; /* Arbitrary non-END op. */
1107 register char *next;
1108 extern char *strchr();
1112 while (op != END) { /* While that wasn't END last time... */
1114 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1116 if (next == NULL) /* Next ptr. */
1119 printf("(%d)", (s-r->program)+(next-s));
1121 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1122 /* Literal string, where present. */
1123 while (*s != '\0') {
1132 /* Header fields of interest. */
1133 if (r->regstart != '\0')
1134 printf("start `%c' ", r->regstart);
1136 printf("anchored ");
1137 if (r->regmust != NULL)
1138 printf("must have \"%s\"", r->regmust);
1143 - regprop - printable representation of opcode
1150 static char buf[50];
1152 (void) strcpy(buf, ":");
1194 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1206 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1216 if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
1217 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1220 } else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
1221 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1224 TclRegError("corrupted opcode");
1229 (void) strcat(buf, p);
1235 * The following is provided for those people who do not have strcspn() in
1236 * their C libraries. They should get off their butts and do something
1237 * about it; at least one public-domain implementation of those (highly
1238 * useful) string routines has been published on Usenet.
1242 * strcspn - find length of initial segment of s1 consisting entirely
1243 * of characters not from s2
1251 register char *scan1;
1252 register char *scan2;
1256 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1257 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1258 if (*scan1 == *scan2++)
1265 #endif /* 0 WHOLE FILE */