2 * re.c - compile regular expressions.
6 * Copyright (C) 1991-2013 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
28 static reg_syntax_t syn;
29 static void check_bracket_exp(char *s, size_t len);
30 const char *regexflags2str(int flags);
32 /* make_regexp --- generate compiled regular expressions */
35 make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
37 static char metas[] = ".*+(){}[]|?^$\\";
41 static char *buf = NULL;
43 const char *end = s + len;
46 static bool first = true;
47 static bool no_dfa = false;
48 bool has_anchor = false;
53 * The number of bytes in the current multibyte character.
54 * It is 0, when the current character is a singlebyte character.
56 size_t is_multibyte = 0;
60 if (gawk_mb_cur_max > 1)
61 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
66 /* for debugging and testing */
67 no_dfa = (getenv("GAWK_NO_DFA") != NULL);
71 check_bracket_exp((char *) s, len);
73 /* Handle escaped characters first. */
76 * Build a copy of the string (in buf) with the
77 * escaped characters translated, and generate the regex
81 emalloc(buf, char *, len + 2, "make_regexp");
83 } else if (len > buflen) {
84 erealloc(buf, char *, len + 2, "make_regexp");
91 if (gawk_mb_cur_max > 1 && ! is_multibyte) {
92 /* The previous byte is a singlebyte character, or last byte
93 of a multibyte character. We check the next character. */
94 is_multibyte = mbrlen(src, end - src, &mbs);
95 if ( is_multibyte == 1
96 || is_multibyte == (size_t) -1
97 || is_multibyte == (size_t) -2
98 || is_multibyte == 0) {
99 /* We treat it as a single-byte character. */
105 /* We skip multibyte character, since it must not be a special
107 if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
127 c2 = parse_escape(&src);
131 * Unix awk treats octal (and hex?) chars
132 * literally in re's, so escape regexp
137 && (isdigit(c) || c == 'x')
138 && strchr("()|*+?.^$\\[]", c2) != NULL)
143 case '9': /* a\9b not valid */
147 case 'y': /* normally \b */
149 if (! do_traditional) {
155 /* else, fall through */
164 if (c == '^' || c == '$')
167 *dest++ = *src++; /* not '\\' */
169 if (gawk_mb_cur_max > 1 && is_multibyte)
176 emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
177 memset((char *) rp, 0, sizeof(*rp));
179 rp->pat.allocated = 0; /* regex will allocate the buffer */
180 emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
183 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
184 * was going to turn out to be, I wouldn't have bothered with it.
186 * In the case where we have a multibyte character set, we have no
187 * choice but to use RE_ICASE, since the casetable is for single-byte
188 * character sets only.
190 * On the other hand, if we do have a single-byte character set,
191 * using the casetable should give a performance improvement, since
192 * it's computed only once, not each time a regex is compiled. We
193 * also think it's probably better for portability. See the
194 * discussion by the definition of casetable[] in eval.c.
197 ignorecase = !! ignorecase; /* force to 1 or 0 */
199 if (gawk_mb_cur_max > 1) {
201 rp->pat.translate = NULL;
204 rp->pat.translate = (RE_TRANSLATE_TYPE) casetable;
207 rp->pat.translate = NULL;
214 dfasyntax(dfa_syn, ignorecase, '\n');
217 if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
220 /* rerr already gettextized inside regex routines */
221 error("%s: /%s/", rerr, buf);
224 fatal("%s: /%s/", rerr, buf);
227 /* gack. this must be done *after* re_compile_pattern */
228 rp->pat.newline_anchor = false; /* don't get \n in middle of string */
229 if (dfa && ! no_dfa) {
231 rp->dfareg = dfaalloc();
232 dfacomp(buf, len, rp->dfareg, true);
235 rp->has_anchor = has_anchor;
237 /* Additional flags that help with RS as regexp. */
238 for (i = 0; i < len; i++) {
239 if (strchr(metas, buf[i]) != NULL) {
245 for (i = len - 1; i >= 0; i--) {
246 if (strchr("*+|?", buf[i]) != NULL) {
247 rp->maybe_long = true;
255 /* research --- do a regexp search. use dfa if possible */
258 research(Regexp *rp, char *str, int start,
259 size_t len, int flags)
261 const char *ret = str;
262 int try_backref = false;
267 need_start = ((flags & RE_NEED_START) != 0);
268 no_bol = ((flags & RE_NO_BOL) != 0);
274 * Always do dfa search if can; if it fails, then even if
275 * need_start is true, we won't bother with the regex search.
277 * The dfa matcher doesn't have a no_bol flag, so don't bother
278 * trying it in that case.
280 * 7/2008: Skip the dfa matcher if need_start. The dfa matcher
281 * has bugs in certain multibyte cases and it's too difficult
282 * to try to special case things.
284 if (rp->dfa && ! no_bol && ! need_start) {
288 * dfa likes to stick a '\n' right after the matched
289 * text. So we just save and restore the character.
291 save = str[start+len];
292 ret = dfaexec(rp->dfareg, str+start, str+start+len, true,
293 &count, &try_backref);
294 str[start+len] = save;
298 if (need_start || rp->dfa == false || try_backref) {
300 * Passing NULL as last arg speeds up search for cases
301 * where we don't need the start/end info.
303 res = re_search(&(rp->pat), str, start+len,
304 start, len, need_start ? &(rp->regs) : NULL);
314 /* refree --- free up the dynamic memory used by a compiled regexp */
321 rp->pat.translate = NULL;
324 free(rp->regs.start);
334 /* dfaerror --- print an error message for the dfa routines */
337 dfaerror(const char *s)
340 exit(EXIT_FATAL); /* for DJGPP */
343 /* re_update --- recompile a dynamic regexp */
350 if ((t->re_flags & CASE) == IGNORECASE) {
351 /* regex was compiled with settings matching IGNORECASE */
352 if ((t->re_flags & CONSTANT) != 0) {
353 /* it's a constant, so just return it as is */
354 assert(t->type == Node_regex);
358 if (t->re_text != NULL) {
359 /* if contents haven't changed, just return it */
360 if (cmp_nodes(t->re_text, t1) == 0)
362 /* things changed, fall through to recompile */
365 /* get fresh copy of the text of the regexp */
366 t->re_text = dupnode(t1);
368 /* was compiled with different IGNORECASE or text changed */
371 if (t->re_reg != NULL)
377 if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
378 /* reset regexp text if needed */
381 t->re_text = dupnode(t1);
384 t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
385 IGNORECASE, t->re_cnt, true);
387 /* clear case flag */
388 t->re_flags &= ~CASE;
389 /* set current value of case flag */
390 t->re_flags |= IGNORECASE;
394 /* resetup --- choose what kind of regexps we match */
400 * Syntax bits: _that_ is yet another mind trip. Recreational drugs
401 * are helpful for recovering from the experience.
403 * Aharon Robbins <arnold@skeeve.com>
404 * Sun, 21 Oct 2007 23:55:33 +0200
407 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
408 else if (do_traditional)
409 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
411 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
414 * Interval expressions are now on by default, as POSIX is
415 * wide-spread enough that people want it. The do_intervals
416 * variable remains for use with --traditional.
419 syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
421 (void) re_set_syntax(syn);
422 dfasyntax(syn, false, '\n');
425 /* avoid_dfa --- return true if we should not use the DFA matcher */
428 avoid_dfa(NODE *re, char *str, size_t len)
432 if (! re->re_reg->has_anchor)
435 for (end = str + len; str < end; str++)
442 /* reisstring --- return true if the RE match is a simple string match */
445 reisstring(const char *text, size_t len, Regexp *re, const char *buf)
450 /* simple checking for meta characters in re */
452 return false; /* give up early, can't be string match */
454 /* make accessable to gdb */
455 matched = &buf[RESTART(re, buf)];
457 res = (memcmp(text, matched, len) == 0);
462 /* reflags2str --- make a regex flags value readable */
465 reflags2str(int flagval)
467 static const struct flagtab values[] = {
468 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
469 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
470 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
471 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
472 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
473 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
474 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
475 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
476 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
477 { RE_INTERVALS, "RE_INTERVALS" },
478 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
479 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
480 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
481 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
482 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
483 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
484 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
485 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
486 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
487 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
488 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
489 { RE_ICASE, "RE_ICASE" },
490 { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
491 { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
492 { RE_NO_SUB, "RE_NO_SUB" },
496 if (flagval == RE_SYNTAX_EMACS) /* == 0 */
497 return "RE_SYNTAX_EMACS";
499 return genflags2str(flagval, values);
503 * dfawarn() is called by the dfa routines whenever a regex is compiled
504 * must supply a dfawarn.
508 dfawarn(const char *dfa_warning)
511 * This routine does nothing, since gawk does its own
512 * (better) check for bad [[:foo:]] syntax.
516 /* check_bracket_exp --- look for /[:space:]/ that should be /[[:space:]]/ */
519 check_bracket_exp(char *s, size_t length)
521 static struct reclass {
527 * Ordered by what we hope is frequency,
528 * since it's linear searched.
530 { "[:alpha:]", 9, false },
531 { "[:digit:]", 9, false },
532 { "[:alnum:]", 9, false },
533 { "[:upper:]", 9, false },
534 { "[:lower:]", 9, false },
535 { "[:space:]", 9, false },
536 { "[:xdigit:]", 10, false },
537 { "[:punct:]", 9, false },
538 { "[:print:]", 9, false },
539 { "[:graph:]", 9, false },
540 { "[:cntrl:]", 9, false },
541 { "[:blank:]", 9, false },
547 char *sp, *sp2, *end;
560 sp = sp2 = memchr(sp, '[', (end - sp));
564 for (count++, sp++; *sp != '\0'; sp++) {
568 * ] as first char after open [ is skipped
572 if (*sp == ']' && sp > sp2) {
576 else if ((sp - sp2) >= 2
577 && sp[-1] == '^' && sp[-2] == '[')
584 sp++; /* skip past ']' */
589 if (count > 0) { /* bad regex, give up */
595 for (i = 0; classes[i].name != NULL; i++) {
596 if (classes[i].warned)
598 len = classes[i].len;
599 if ( len == (sp - sp2)
600 && memcmp(sp2, classes[i].name, len) == 0) {
606 if (found && ! classes[i].warned) {
607 warning(_("regexp component `%.*s' should probably be `[%.*s]'"),
609 classes[i].warned = true;
620 /* regexflags2str --- make regex flags printable */
623 regexflags2str(int flags)
625 static const struct flagtab regextab[] = {
626 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
627 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
628 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
629 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
630 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
631 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
632 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
633 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
634 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
635 { RE_INTERVALS, "RE_INTERVALS" },
636 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
637 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
638 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
639 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
640 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
641 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
642 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
643 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
644 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
645 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
646 { RE_DEBUG, "RE_DEBUG" },
647 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
648 { RE_ICASE, "RE_ICASE" },
649 { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
650 { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
654 return genflags2str(flags, regextab);