2 * re2glob - C implementation
3 * (c) 2007 ActiveState Software Inc.
11 ExpChopNested _ANSI_ARGS_ ((Tcl_UniChar** xstr,
17 ExpLiteral _ANSI_ARGS_ ((Tcl_UniChar* nexto,
22 ExpCollapseStar _ANSI_ARGS_ ((Tcl_UniChar* src,
25 ExpCollapseQForward _ANSI_ARGS_ ((Tcl_UniChar* src,
29 ExpCollapseQBack _ANSI_ARGS_ ((Tcl_UniChar* src,
33 ExpBackslash _ANSI_ARGS_ ((char prefix,
38 ExpCountStar _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last));
42 xxx (Tcl_UniChar* x, int xl)
44 static Tcl_DString ds;
45 Tcl_DStringInit (&ds);
46 return Tcl_UniCharToUtfDString (x,xl,&ds);
56 * Output: x2 size of input (literal where every character has to be
58 * Location: For next translated unit, in output.
59 * Size of last generated unit, in characters.
60 * Stack of output locations at opening parens. x1 size of input.
61 * Location for next location on stack.
64 static Tcl_UniChar litprefix [] = {'*','*','*','='};
65 static Tcl_UniChar areprefix [] = {'*','*','*',':'};
66 static Tcl_UniChar areopts [] = {'(','?'};
67 static Tcl_UniChar nocapture [] = {'?',':'};
68 static Tcl_UniChar lookhas [] = {'?','='};
69 static Tcl_UniChar looknot [] = {'?','!'};
70 static Tcl_UniChar xcomment [] = {'?','#'};
72 static Tcl_UniChar classa [] = {'[','.'};
73 static Tcl_UniChar classb [] = {'[','='};
74 static Tcl_UniChar classc [] = {'[',':'};
90 out = nexto = (Tcl_UniChar*) Tcl_Alloc (strlen*2*sizeof (Tcl_UniChar));
91 paren = nextp = (Tcl_UniChar**) Tcl_Alloc (strlen* sizeof (Tcl_UniChar*));
96 * Start processing ...
99 #define CHOP(n) {str += (n); strlen -= (n);}
100 #define CHOPC(c) {while (*str != (c) && strlen) CHOP(1) ;}
101 #define EMIT(c) {lastsz = 1; *nexto++ = (c);}
102 #define EMITX(c) {lastsz++; *nexto++ = (c);}
103 #define MATCH(lit) ((strlen >= (sizeof (lit)/sizeof (Tcl_UniChar))) && (0 == Tcl_UniCharNcmp (str,(lit),sizeof(lit)/sizeof (Tcl_UniChar))))
104 #define MATCHC(c) (strlen && (*str == (c)))
105 #define PUSHPAREN {*nextp++ = nexto;}
106 #define UNEMIT {nexto -= lastsz; lastsz = -1;}
107 /* Tcl_UniCharIsDigit ? */
108 #define MATCH_DIGIT (MATCHC ('0') || MATCHC ('1') || \
109 MATCHC ('2') || MATCHC ('3') || \
110 MATCHC ('4') || MATCHC ('5') || \
111 MATCHC ('6') || MATCHC ('7') || \
112 MATCHC ('8') || MATCHC ('9'))
113 #define MATCH_HEXDIGIT (MATCH_DIGIT || \
114 MATCHC ('a') || MATCHC ('A') || \
115 MATCHC ('b') || MATCHC ('B') || \
116 MATCHC ('c') || MATCHC ('C') || \
117 MATCHC ('d') || MATCHC ('D') || \
118 MATCHC ('e') || MATCHC ('E') || \
119 MATCHC ('f') || MATCHC ('F'))
120 #define EMITC(c) {if (((c) == '\\') || \
126 EMIT ('\\'); EMITX ((c)); \
129 #define MATCH_AREOPTS(c) (c == 'b' || c == 'c' || \
130 c == 'e' || c == 'i' || c == 'm' || c == 'n' || \
131 c == 'p' || c == 'q' || c == 's' || c == 't' || \
132 c == 'w' || c == 'x')
135 #define LOG if (1) fprintf
136 #define FF fflush (stderr)
137 #define MARK(s) LOG (stderr,#s "\n"); FF;
139 #define LOG if (0) fprintf
144 /* ***= -> literal string follows */
146 LOG (stderr,"RE-2-GLOB '%s'\n", xxx(str,strlen)); FF;
148 if (MATCH (litprefix)) {
150 nexto = ExpLiteral (nexto, str, strlen);
154 /* ***: -> RE is ARE. Always for Expect. Therefore ignore */
156 if (MATCH (areprefix)) {
158 LOG (stderr,"ARE '%s'\n", xxx(str,strlen)); FF;
161 /* (?xyz) ARE options, in {bceimnpqstwx}. Not validating that the
162 * options are legal. We assume that the RE is valid.
165 if (MATCH (areopts)) { /* "(?" */
166 Tcl_UniChar* save = str;
169 int save_strlen = strlen;
170 int all_ARE_opts = 1;
172 /* First, ensure that this is actually an ARE opts string.
173 * It could be something else (e.g., a non-capturing block).
176 mark = str; CHOPC (')');
177 stop = str; /* Remember closing parens location, allows */
178 stoplen = strlen; /* us to avoid a second CHOPC run later */
181 if (MATCH_AREOPTS(*mark)) {
189 /* Reset back to our entry point. */
191 strlen = save_strlen;
194 /* Now actually perform the ARE option processing */
195 LOG (stderr, "%s\n", "Processing AREOPTS"); FF;
199 /* Equivalent to CHOPC (')') */
206 nexto = ExpLiteral (nexto, str, strlen);
208 } else if (*mark == 'x') {
210 LOG (stderr,"EXPANDED\n"); FF;
220 LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF;
221 LOG (stderr,"'%s'\n", xxx(str,strlen)); FF;
224 /* Expanded syntax, whitespace and comments, ignore. */
225 while (MATCHC (' ') ||
227 MATCHC (0xa)) CHOP (1);
230 if (strlen) CHOP (1);
236 /* branching is too complex */
238 } else if (MATCHC ('(')) {
241 if (MATCH (nocapture)) { /* "?:" */
242 /* non capturing -save location */
245 } else if (MATCH (lookhas) || /* "?=" */
246 MATCH (looknot)) { /* "?!" */
247 /* lookahead - ignore */
249 ExpChopNested (&str, &strlen, '(', ')');
250 } else if (MATCH (xcomment)) { /* "?#" */
251 /* comment - ignore */
252 CHOPC (')'); CHOP (1);
254 /* plain capturing */
257 } else if (MATCHC (')')) {
258 /* Closing parens. */
260 /* Everything coming after the saved result is new, and
261 * collapsed into a single entry for a possible coming operator
264 nextp --; /* Back to last save */
265 mark = *nextp; /* Location where generation for this parens started */
266 lastsz = (nexto - mark); /* This many chars generated */
267 /* Now lastsz has the correct value for a possibly following
270 } else if (MATCHC ('$') || MATCHC ('^')) {
271 /* anchor constraints - ignore */
273 } else if (MATCHC ('[')) {
274 /* Classes - reduce to any char [[=chars=]] [[.chars.]]
275 * [[:name:]] [chars] Count brackets to find end.
277 * These are a bit complicated ... [= =], [. .], [: {] sequences
278 * always have to be complete. '[' does NOT nest otherwise. And
279 * a ']' after the opening '[' (with only '^' allowed to
280 * intervene is a character, not the closing bracket. We have to
281 * process the class in pieces to handle all this. The Tcl level
282 * implementations (0-2 all have bugs one way or other, all
290 if (first && MATCHC ('^')) {
291 /* ^ as first keeps allowed ok for one more cycle */
295 } else if (allowed && MATCHC (']')) {
296 /* Not a closing bracket! */
298 } else if (MATCHC (']')) {
299 /* Closing bracket found */
302 } else if (MATCH (classa) ||
305 Tcl_UniChar delim[2];
309 while (!MATCH (delim)) CHOP (1);
312 /* Any char in class */
315 /* Reset flags handling start of class */
320 } else if (MATCHC ('\\')) {
323 if (MATCHC ('d') || MATCHC ('D') ||
324 MATCHC ('s') || MATCHC ('S') ||
325 MATCHC ('w') || MATCHC ('W')) {
326 /* Class shorthands - reduce to any char */
329 } else if (MATCHC ('m') || MATCHC ('M') ||
330 MATCHC ('y') || MATCHC ('Y') ||
331 MATCHC ('A') || MATCHC ('Z')) {
332 /* constraint escapes - ignore */
334 } else if (MATCHC ('B')) {
339 } else if (MATCHC ('0')) {
343 } else if (MATCHC ('e')) {
347 } else if (MATCHC ('a')) {
351 } else if (MATCHC ('b')) {
355 } else if (MATCHC ('f')) {
359 } else if (MATCHC ('n')) {
363 } else if (MATCHC ('r')) {
367 } else if (MATCHC ('t')) {
371 } else if (MATCHC ('v')) {
375 } else if (MATCHC ('c') && (strlen >= 2)) {
376 /* Escape \cX - reduce to (.) */
379 } else if (MATCHC ('x')) {
381 if (MATCH_HEXDIGIT) {
382 /* Escape hex character */
384 while (MATCH_HEXDIGIT) CHOP (1);
385 if ((str - mark) > 2) { mark = str - 2; }
386 ch = ExpBackslash ('x',mark,str-mark);
389 /* Without hex digits following this is a plain char */
392 } else if (MATCHC ('u')) {
393 /* Escapes unicode short. */
397 ch = ExpBackslash ('u',mark,str-mark);
399 } else if (MATCHC ('U')) {
400 /* Escapes unicode long. */
404 ch = ExpBackslash ('U',mark,str-mark);
406 } else if (MATCH_DIGIT) {
407 /* Escapes, octal, and backreferences - reduce (.*) */
409 while (MATCH_DIGIT) CHOP (1);
412 /* Plain escaped characters - copy over, requote */
416 } else if (MATCHC ('{')) {
417 /* Non-greedy and greedy bounds - reduce to (*) */
420 /* Locate closing brace and remove operator */
421 CHOPC ('}'); CHOP (1);
422 /* Remove optional greedy quantifier */
423 if (MATCHC ('?')) { CHOP (1);}
427 /* Brace is plain character, copy over */
429 /* CHOP already done */
431 } else if (MATCHC ('*') ||
434 /* (Non-)greedy operators - reduce to (*) */
436 /* Remove optional greedy quantifier */
437 if (MATCHC ('?')) { CHOP (1);}
440 } else if (MATCHC ('.')) {
441 /* anychar - copy over */
445 /* Plain char, copy over. */
451 LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF;
452 LOG (stderr,"'%s'\n", xxx(str,strlen)); FF;
455 * Clean up the output a bit (collapse *-sequences and absorb ?'s
460 nexto = ExpCollapseQForward (out,nexto);
461 LOG (stderr,"QF '%s'\n",xxx(out,nexto-out)); FF;
464 nexto = ExpCollapseQBack (out,nexto);
465 LOG (stderr,"QB '%s'\n",xxx(out,nexto-out)); FF;
468 nexto = ExpCollapseStar (out,nexto);
469 LOG (stderr,"ST '%s'\n",xxx(out,nexto-out)); FF;
472 * Heuristic: if there are more than two *s, the risk is far too
473 * large that the result actually is slower than the normal re
474 * matching. So bail out.
476 if (ExpCountStar (out,nexto) > 2) {
481 * Check if the result is actually useful.
482 * Empty or just a *, or ? are not. A series
483 * of ?'s is borderline, as they semi-count
487 if ((nexto == out) ||
488 (((nexto-out) == 1) &&
495 * Result generation and cleanup.
498 LOG (stderr,"RESULT_ '%s'\n", xxx(out,nexto-out)); FF;
499 glob = Tcl_NewUnicodeObj (out,(nexto-out));
503 LOG (stderr,"RESULT_ ERROR\n"); FF;
506 Tcl_Free ((char*)out);
507 Tcl_Free ((char*)paren);
514 ExpChopNested (Tcl_UniChar** xstr,
519 ExpChopNested (xstr,xstrlen, open, close)
526 Tcl_UniChar* str = *xstr;
527 int strlen = *xstrlen;
533 } else if (MATCHC (close)) {
548 ExpLiteral (nexto, str, strlen)
555 LOG (stderr,"LITERAL '%s'\n", xxx(str,strlen)); FF;
566 ExpBackslash (char prefix,
570 ExpBackslash (prefix, str, strlen)
578 char dst[TCL_UTF_MAX+1];
582 /* Construct an utf backslash sequence we can throw to Tcl */
591 Tcl_UtfBackslash (buf, NULL, dst);
592 TclUtfToUniChar (dst, &ch);
597 ExpCollapseStar (src, last)
601 Tcl_UniChar* dst, *base;
605 /* Collapses series of *'s into a single *. State machine. The
606 * complexity is due to the need of handling escaped characters.
609 LOG (stderr,"Q-STAR\n"); FF;
611 for (dst = base = src; src < last;) {
613 LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF;
614 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
619 } else if (*src == '\\') {
620 skip = 1; /* Copy next char, whatever its value */
622 } else if (*src == '*') {
624 /* Previous char was *, do not copy the current * to collapse
630 star = 1; /* *-series starts here */
637 LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF;
638 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
644 ExpCollapseQForward (src, last)
648 Tcl_UniChar* dst, *base;
652 /* Collapses series of ?'s coming after a *. State machine. The
653 * complexity is due to the need of handling escaped characters.
656 LOG (stderr,"Q-Forward\n"); FF;
658 for (dst = base = src; src < last;) {
660 LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF;
661 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
666 } else if (*src == '\\') {
669 /* Copy next char, whatever its value */
670 } else if (*src == '?') {
672 /* Previous char was *, do not copy the current ? to collapse
678 } else if (*src == '*') {
686 LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF;
687 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
692 ExpCollapseQBack (src, last)
696 Tcl_UniChar* dst, *base;
699 /* Collapses series of ?'s coming before a *. State machine. The
700 * complexity is due to the need of handling escaped characters.
703 LOG (stderr,"Q-Backward\n"); FF;
705 for (dst = base = src; src < last;) {
706 LOG (stderr,"/%1d '%s' <-- ",skip,xxx(base,dst-base)); FF;
707 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
711 } else if (*src == '\\') {
713 /* Copy next char, whatever its value */
714 } else if (*src == '*') {
715 /* Move backward in the output while the previous character is
716 * an unescaped question mark. If there is a previous character,
717 * or a character before that..
720 while ((((dst-base) > 2) && (dst[-1] == '?') && (dst[-2] != '\\')) ||
721 (((dst-base) == 1) && (dst[-1] == '?'))) {
728 LOG (stderr,"/%1d '%s' <-- \n",skip,xxx(base,dst-base)); FF;
729 LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
734 ExpCountStar (src, last)
741 /* Count number of *'s. State machine. The complexity is due to the
742 * need of handling escaped characters.
745 for (; src < last; src++) {
748 } else if (*src == '\\') {
750 } else if (*src == '*') {
765 #undef MATCH_HEXDIGIT