4 * Copyright (C) 2007-2016 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
31 static const struct string empty_pattern_string = {
32 .ref = REF_MAX, .str = (char *) "()"
35 static const struct string *const empty_pattern = &empty_pattern_string;
37 char *regexp_escape(const struct regexp *r) {
48 /* Use a range with from > to to force conversion of ranges into
50 ret = fa_restrict_alphabet(r->pattern->str, strlen(r->pattern->str),
51 &nre, &nre_len, 2, 1);
53 pat = escape(nre, nre_len, RX_ESCAPES);
59 /* simplify the regexp by removing some artifacts of reserving
60 chanaracters for internal purposes */
61 if (index(r->pattern->str, RESERVED_FROM_CH)) {
62 char *s = strdup(r->pattern->str);
64 for (char *p = s; *p; p++) {
65 if (STREQLEN(p, RESERVED_RANGE_RX, strlen(RESERVED_RANGE_RX))) {
66 /* Completely eliminate mentions of the reserved range */
67 p += strlen(RESERVED_RANGE_RX);
68 } else if (STREQLEN(p,
69 RESERVED_DOT_RX, strlen(RESERVED_DOT_RX))) {
70 /* Replace what amounts to a '.' by one */
71 p += strlen(RESERVED_DOT_RX);
77 pat = escape(s, -1, RX_ESCAPES);
80 pat = escape(r->pattern->str, -1, RX_ESCAPES);
87 /* Remove unneeded '()' from pat */
88 for (int changed = 1; changed;) {
90 for (char *p = pat; *p != '\0'; p++) {
91 if (*p == '(' && p[1] == ')') {
92 memmove(p, p+2, strlen(p+2)+1);
98 if (pat[0] == '(' && pat[strlen(pat)-1] == ')') {
100 for (int i=1; i < strlen(pat)-1; i++) {
109 memmove(pat, pat+1, strlen(pat+1)+1);
110 pat[strlen(pat)-1] = '\0';
117 void print_regexp(FILE *out, struct regexp *r) {
119 fprintf(out, "<NULL>");
124 if (r->pattern == NULL)
125 fprintf(out, "%p", r);
129 fa_restrict_alphabet(r->pattern->str, strlen(r->pattern->str),
131 print_chars(out, rx, rx_len);
140 make_regexp_unescape(struct info *info, const char *pat, int nocase) {
141 char *p = unescape(pat, strlen(pat), NULL);
145 return make_regexp(info, p, nocase);
148 struct regexp *make_regexp(struct info *info, char *pat, int nocase) {
149 struct regexp *regexp;
152 regexp->info = ref(info);
154 make_ref(regexp->pattern);
155 regexp->pattern->str = pat;
156 regexp->nocase = nocase;
160 /* Take a POSIX glob and turn it into a regexp. The regexp is constructed
161 * by doing the following translations of characters in the string:
164 * leave characters escaped with a backslash alone
165 * escape any of ".|{}()+^$" with a backslash
167 * Note that that ignores some of the finer points of globs, like
170 struct regexp *make_regexp_from_glob(struct info *info, const char *glob) {
171 static const char *const star = "[^/]*";
172 static const char *const qmark = "[^/]";
173 static const char *const special = ".|{}()+^$";
174 int newlen = strlen(glob);
177 for (const char *s = glob; *s; s++) {
178 if (*s == '\\' && *(s+1))
181 newlen += strlen(star)-1;
183 newlen += strlen(qmark)-1;
184 else if (strchr(special, *s) != NULL)
188 if (ALLOC_N(pat, newlen + 1) < 0)
192 for (const char *s = glob; *s; s++) {
193 if (*s == '\\' && *(s+1)) {
196 } else if (*s == '*') {
198 } else if (*s == '?') {
199 t = stpcpy(t, qmark);
200 } else if (strchr(special, *s) != NULL) {
208 return make_regexp(info, pat, 0);
211 void free_regexp(struct regexp *regexp) {
214 assert(regexp->ref == 0);
215 unref(regexp->info, info);
216 unref(regexp->pattern, string);
217 if (regexp->re != NULL) {
224 int regexp_is_empty_pattern(struct regexp *r) {
225 for (char *s = r->pattern->str; *s; s++) {
226 if (*s != '(' && *s != ')')
232 struct regexp *make_regexp_literal(struct info *info, const char *text) {
235 /* Escape special characters in text since it should be taken
237 if (ALLOC_N(pattern, 2*strlen(text)+1) < 0)
240 for (const char *t = text; *t != '\0'; t++) {
241 if ((*t == '\\') && t[1]) {
244 } else if (strchr(".|{}[]()+*?", *t) != NULL) {
251 return make_regexp(info, pattern, 0);
255 regexp_union(struct info *info, struct regexp *r1, struct regexp *r2) {
260 return regexp_union_n(info, 2, r);
263 char *regexp_expand_nocase(struct regexp *r) {
264 const char *p = r->pattern->str, *t;
268 int psub = 0, rsub = 0;
273 ret = fa_expand_nocase(p, strlen(p), &s, &len);
274 ERR_NOMEM(ret == REG_ESPACE, r->info);
275 BUG_ON(ret != REG_NOERROR, r->info, NULL);
277 /* Make sure that r->pattern->str and ret have the same number
278 * of parentheses/groups, since our parser critically depends
279 * on the fact that the regexp for a union/concat and those
280 * of its children have groups that are in direct relation */
281 for (t = p; *t; t++) if (*t == '(') psub += 1;
282 for (t = s; *t; t++) if (*t == '(') rsub += 1;
283 BUG_ON(psub < rsub, r->info, NULL);
286 char *adjusted = NULL, *a;
287 if (ALLOC_N(adjusted, strlen(s) + 2*psub + 1) < 0)
288 ERR_NOMEM(true, r->info);
290 for (int i=0; i < psub; i++) *a++ = '(';
292 for (int i=0; i < psub; i++) *a++ = ')';
300 static char *append_expanded(struct regexp *r, char **pat, char *p,
302 char *expanded = NULL;
303 size_t ofs = p - *pat;
306 expanded = regexp_expand_nocase(r);
309 *len += strlen(expanded) - strlen(r->pattern->str);
311 ret = REALLOC_N(*pat, *len);
312 ERR_NOMEM(ret < 0, r->info);
314 p = stpcpy(*pat + ofs, expanded);
321 regexp_union_n(struct info *info, int n, struct regexp **r) {
323 char *pat = NULL, *p, *expanded = NULL;
324 int nnocase = 0, npresent = 0;
326 for (int i=0; i < n; i++)
328 len += strlen(r[i]->pattern->str) + strlen("()|");
334 bool mixedcase = nnocase > 0 && nnocase < npresent;
339 if (ALLOC_N(pat, len) < 0)
344 for (int i=0; i < n; i++) {
350 if (mixedcase && r[i]->nocase) {
351 p = append_expanded(r[i], &pat, p, &len);
352 ERR_BAIL(r[i]->info);
354 p = stpcpy(p, r[i]->pattern->str);
360 return make_regexp(info, pat, nnocase == npresent);
368 regexp_concat(struct info *info, struct regexp *r1, struct regexp *r2) {
373 return regexp_concat_n(info, 2, r);
377 regexp_concat_n(struct info *info, int n, struct regexp **r) {
379 char *pat = NULL, *p, *expanded = NULL;
380 int nnocase = 0, npresent = 0;
382 for (int i=0; i < n; i++)
384 len += strlen(r[i]->pattern->str) + strlen("()");
390 bool mixedcase = nnocase > 0 && nnocase < npresent;
396 if (ALLOC_N(pat, len) < 0)
400 for (int i=0; i < n; i++) {
404 if (mixedcase && r[i]->nocase) {
405 p = append_expanded(r[i], &pat, p, &len);
406 ERR_BAIL(r[i]->info);
408 p = stpcpy(p, r[i]->pattern->str);
413 return make_regexp(info, pat, nnocase == npresent);
420 static struct fa *regexp_to_fa(struct regexp *r) {
421 const char *p = r->pattern->str;
423 struct fa *fa = NULL;
425 ret = fa_compile(p, strlen(p), &fa);
426 ERR_NOMEM(ret == REG_ESPACE, r->info);
427 BUG_ON(ret != REG_NOERROR, r->info, NULL);
431 ERR_NOMEM(ret < 0, r->info);
441 regexp_minus(struct info *info, struct regexp *r1, struct regexp *r2) {
442 struct regexp *result = NULL;
443 struct fa *fa = NULL, *fa1 = NULL, *fa2 = NULL;
448 fa1 = regexp_to_fa(r1);
451 fa2 = regexp_to_fa(r2);
454 fa = fa_minus(fa1, fa2);
458 r = fa_as_regexp(fa, &s, &s_len);
463 /* FA is the empty set, which we can't represent as a regexp */
467 if (regexp_c_locale(&s, NULL) < 0)
470 result = make_regexp(info, s, fa_is_nocase(fa));
480 unref(result, regexp);
486 regexp_iter(struct info *info, struct regexp *r, int min, int max) {
495 if ((min == 0 || min == 1) && max == -1) {
496 char q = (min == 0) ? '*' : '+';
497 ret = asprintf(&s, "(%s)%c", p, q);
498 } else if (min == max) {
499 ret = asprintf(&s, "(%s){%d}", p, min);
501 ret = asprintf(&s, "(%s){%d,%d}", p, min, max);
503 return (ret == -1) ? NULL : make_regexp(info, s, r->nocase);
507 regexp_maybe(struct info *info, struct regexp *r) {
515 ret = asprintf(&s, "(%s)?", p);
516 return (ret == -1) ? NULL : make_regexp(info, s, r->nocase);
519 struct regexp *regexp_make_empty(struct info *info) {
520 struct regexp *regexp;
523 if (regexp != NULL) {
524 regexp->info = ref(info);
525 /* Casting away the CONST for EMPTY_PATTERN is ok since it
526 is protected against changes because REF == REF_MAX */
527 regexp->pattern = (struct string *) empty_pattern;
533 static int regexp_compile_internal(struct regexp *r, const char **c) {
534 /* See the GNU regex manual or regex.h in gnulib for
535 * an explanation of these flags. They are set so that the regex
536 * matcher interprets regular expressions the same way that libfa
539 static const reg_syntax_t syntax =
540 RE_CONTEXT_INDEP_OPS|RE_CONTEXT_INVALID_OPS|RE_DOT_NOT_NULL
541 |RE_INTERVALS|RE_NO_BK_BRACES|RE_NO_BK_PARENS|RE_NO_BK_REFS
542 |RE_NO_BK_VBAR|RE_NO_EMPTY_RANGES
543 |RE_NO_POSIX_BACKTRACKING|RE_CONTEXT_INVALID_DUP|RE_NO_GNU_OPS;
544 reg_syntax_t old_syntax = re_syntax_options;
549 if (ALLOC(r->re) < 0)
553 re_syntax_options = syntax;
555 re_syntax_options |= RE_ICASE;
556 *c = re_compile_pattern(r->pattern->str, strlen(r->pattern->str), r->re);
557 re_syntax_options = old_syntax;
559 r->re->regs_allocated = REGS_REALLOCATE;
565 int regexp_compile(struct regexp *r) {
568 return regexp_compile_internal(r, &c);
571 int regexp_check(struct regexp *r, const char **msg) {
572 return regexp_compile_internal(r, msg);
575 int regexp_match(struct regexp *r,
576 const char *string, const int size,
577 const int start, struct re_registers *regs) {
579 if (regexp_compile(r) == -1)
582 return re_match(r->re, string, size, start, regs);
585 int regexp_matches_empty(struct regexp *r) {
586 return regexp_match(r, "", 0, 0, NULL) == 0;
589 int regexp_nsub(struct regexp *r) {
591 if (regexp_compile(r) == -1)
593 return r->re->re_nsub;
596 void regexp_release(struct regexp *regexp) {
597 if (regexp != NULL && regexp->re != NULL) {
605 * indent-tabs-mode: nil