1 /* $Id: regex.c,v 1.22 2003/09/24 18:49:00 ukai Exp $ */
3 * regex: Regular expression pattern match library
5 * by A.ITO, December 1989
6 * Revised by A.ITO, January 2002
10 #include <sys/types.h>
12 #endif /* REGEX_DEBUG */
33 #define RE_ITER_LIMIT 65535
35 #define RE_MATCHMODE 0x07
36 #define RE_NORMAL 0x00
39 #define RE_EXCEPT 0x03
40 #define RE_SUBREGEX 0x04
43 #define RE_ENDMARK 0x07
46 #define RE_ANYTIME 0x10
47 #define RE_IGNCASE 0x40
49 #define RE_MODE(x) ((x)->mode&RE_MATCHMODE)
50 #define RE_SET_MODE(x,v) ((x)->mode = (((x)->mode&~RE_MATCHMODE)|((v)&RE_MATCHMODE)))
53 void debugre(regexchar *);
54 char *lc2c(longchar *, int);
56 #endif /* REGEX_DEBUG */
59 #define get_mclen(c) wtf_len1((wc_uchar *)(c))
61 #define get_mclen(c) 1
66 #define TOLOWER(x) tolower(x)
67 #define TOUPPER(x) toupper(x)
71 #define RE_TYPE_CHAR 1
72 #define RE_TYPE_WCHAR_T 2
73 #define RE_WHICH_RANGE 3
74 #define RE_TYPE_SYMBOL 4
77 set_longchar(char *str)
79 unsigned char *p = (unsigned char *)str;
84 r.wch = wtf_parse1(&p);
85 if (r.wch.ccs == WC_CCS_SPECIAL || r.wch.ccs == WC_CCS_SPECIAL_W) {
86 r.type = RE_TYPE_SYMBOL;
90 if (WC_CCS_IS_UNICODE(r.wch.ccs)) {
91 if (WC_CCS_SET(r.wch.ccs) == WC_CCS_UCS_TAG)
92 r.wch.code = wc_ucs_tag_to_ucs(r.wch.code);
93 r.wch.ccs = WC_CCS_UCS4;
97 r.wch.ccs = WC_CCS_SET(r.wch.ccs);
98 r.type = RE_TYPE_WCHAR_T;
103 r.type = RE_TYPE_CHAR;
107 static Regex DefaultRegex;
108 #define CompiledRegex DefaultRegex.re
109 #define Cstorage DefaultRegex.storage
111 static int regmatch(regexchar *, char *, char *, int, char **);
112 static int regmatch1(regexchar *, longchar *);
113 static int matchWhich(longchar *, longchar *, int);
114 static int match_longchar(longchar *, longchar *, int);
115 static int match_range_longchar(longchar *, longchar *, longchar *, int);
118 * regexCompile: compile regular expression
121 regexCompile(char *ex, int igncase)
124 newRegex(ex, igncase, &DefaultRegex, &msg);
129 newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
138 regex = (Regex *)GC_malloc(sizeof(Regex));
139 regex->alt_regex = NULL;
141 st_ptr = regex->storage;
142 for (p = *ex; *p != '\0'; p++) {
146 re->p.pattern = NULL;
147 RE_SET_MODE(re, RE_ANY);
151 re->p.pattern = NULL;
152 RE_SET_MODE(re, RE_END);
156 re->p.pattern = NULL;
157 RE_SET_MODE(re, RE_BEGIN);
161 if (re == regex->re ||
162 (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
164 *msg = "Invalid regular expression";
168 re->mode |= RE_ANYTIME;
172 if (re == regex->re ||
173 (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
175 *msg = "Invalid regular expression";
178 (re - 1)->mode |= RE_ANYTIME;
181 if (re == regex->re ||
182 (RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
184 *msg = "Invalid regular expression";
187 (re - 1)->mode |= RE_OPT;
197 if (*p == '-' || *p == ']')
198 *(st_ptr++) = set_longchar(p);
202 *(st_ptr++) = set_longchar(p);
205 else if (*p == '-' && *(p + 1) != ']') {
206 (st_ptr++)->type = RE_WHICH_RANGE;
209 else if (*p == '\0') {
215 *(st_ptr++) = set_longchar(p);
218 if (st_ptr >= ®ex->storage[STORAGE_MAX]) {
220 *msg = "Regular expression too long";
224 (st_ptr++)->type = RE_TYPE_END;
228 re->mode |= RE_IGNCASE;
232 RE_SET_MODE(re, RE_ENDMARK);
235 regex->alt_regex = newRegex0(&p, igncase, NULL, msg, level);
236 if (regex->alt_regex == NULL)
241 RE_SET_MODE(re, RE_SUBREGEX);
243 re->p.sub = newRegex0(&p, igncase, NULL, msg, level + 1);
244 if (re->p.sub == NULL)
251 *msg = "Too many ')'";
254 RE_SET_MODE(re, RE_ENDMARK);
261 *(st_ptr) = set_longchar(p);
262 p += get_mclen(p) - 1;
263 re->p.pattern = st_ptr;
265 RE_SET_MODE(re, RE_NORMAL);
267 re->mode |= RE_IGNCASE;
270 if (st_ptr >= ®ex->storage[STORAGE_MAX] ||
271 re >= ®ex->re[REGEX_MAX]) {
273 *msg = "Regular expression too long";
277 RE_SET_MODE(re, RE_ENDMARK);
285 newRegex(char *ex, int igncase, Regex *regex, char **msg)
287 return newRegex0(&ex, igncase, regex, msg, 0);
291 * regexMatch: match regular expression
294 regexMatch(char *str, int len, int firstp)
296 return RegexMatch(&DefaultRegex, str, len, firstp);
300 RegexMatch(Regex *re, char *str, int len, int firstp)
312 for (p = str; p <= ep; p++) {
314 re->lposition = NULL;
315 for (r = re; r != NULL; r = r->alt_regex) {
316 switch (regmatch(r->re, p, ep, firstp && (p == str), &lpos)) {
317 case 1: /* matched */
319 if (re->lposition == NULL || re->lposition < lpos)
320 re->lposition = lpos;
327 if (re->lposition != NULL) {
331 p += get_mclen(p) - 1;
337 * matchedPosition: last matched position
340 MatchedPosition(Regex *re, char **first, char **last)
342 *first = re->position;
343 *last = re->lposition;
347 matchedPosition(char **first, char **last)
349 *first = DefaultRegex.position;
350 *last = DefaultRegex.lposition;
357 struct MatchingContext1 {
367 struct MatchingContext1 *sub_ctx;
368 struct MatchingContext2 *ctx2;
371 struct MatchingContext2 {
375 struct MatchingContext1 *ctx;
376 struct MatchingContext2 *ctx2;
383 #define YIELD(retval,context,lnum) (context)->label = lnum; return (retval); label##lnum:
385 static int regmatch_iter(struct MatchingContext1 *,
386 regexchar *, char *, char *, int);
389 regmatch_sub_anytime(struct MatchingContext2 *c, Regex *regex,
391 char *str, char *end_p, int iter_limit, int firstp)
401 c->ctx = GC_malloc(sizeof(struct MatchingContext1));
402 c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
410 while (regmatch_iter(c->ctx, c->regex->re, c->str, end_p, c->firstp)) {
411 c->n_any = c->ctx->lastpos - c->str;
415 if (RE_MODE(pat2) == RE_ENDMARK) {
416 c->lastpos = c->str + c->n_any;
419 else if (regmatch(pat2, c->str + c->n_any, end_p,
420 c->firstp, &c->lastpos) == 1) {
426 while (regmatch_sub_anytime(c->ctx2, regex, pat2,
427 c->str + c->n_any, end_p,
428 iter_limit - 1, c->firstp)) {
430 c->lastpos = c->ctx2->lastpos;
434 if (c->regex->alt_regex == NULL)
436 c->regex = c->regex->alt_regex;
442 regmatch_iter(struct MatchingContext1 *c,
443 regexchar * re, char *str, char *end_p, int firstp)
461 if (RE_MODE(re) == RE_ENDMARK)
469 while (RE_MODE(c->re) != RE_ENDMARK) {
470 if (c->re->mode & (RE_ANYTIME | RE_OPT)) {
471 if (c->re->mode & RE_ANYTIME)
472 c->iter_limit = RE_ITER_LIMIT;
476 while (c->n_any < c->iter_limit) {
477 if (c->str + c->n_any >= c->end_p) {
481 if (RE_MODE(c->re) == RE_SUBREGEX) {
482 c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
484 while (regmatch_sub_anytime(c->ctx2,
491 c->n_any = c->ctx2->lastpos - c->str;
492 c->lastpos = c->ctx2->lastpos;
499 k = set_longchar(c->str + c->n_any);
500 if (regmatch1(c->re, &k)) {
501 c->n_any += get_mclen(c->str + c->n_any);
511 if (RE_MODE(c->re + 1) == RE_ENDMARK) {
512 c->lastpos = c->str + c->n_any;
515 else if (regmatch(c->re + 1, c->str + c->n_any, c->end_p,
516 c->firstp, &c->lastpos) == 1) {
522 /* regexp other than pat*, pat+ and pat? */
523 switch (RE_MODE(c->re)) {
530 if (c->str >= c->end_p) {
541 if (c->sub_ctx == NULL) {
542 c->sub_ctx = GC_malloc(sizeof(struct MatchingContext1));
544 c->sub_regex = c->re->p.sub;
546 c->sub_ctx->label = 0;
547 while (regmatch_iter(c->sub_ctx, c->sub_regex->re,
548 c->str, c->end_p, c->firstp)) {
549 if (c->sub_ctx->lastpos != c->str)
551 if (RE_MODE(c->re + 1) == RE_ENDMARK) {
552 c->lastpos = c->sub_ctx->lastpos;
555 else if (regmatch(c->re + 1, c->sub_ctx->lastpos, c->end_p,
556 c->firstp, &c->lastpos) == 1) {
560 if (c->sub_regex->alt_regex == NULL)
562 c->sub_regex = c->sub_regex->alt_regex;
568 k = set_longchar(c->str);
569 c->str += get_mclen(c->str);
570 if (!regmatch1(c->re, &k))
576 if (c->str > c->end_p) {
583 printf("Succeed: %s %d\n", c->str, c->lastpos - c->str);
590 regmatch(regexchar * re, char *str, char *end_p, int firstp, char **lastpos)
592 struct MatchingContext1 contx;
597 while (regmatch_iter(&contx, re, str, end_p, firstp)) {
601 printf("regmatch: matched <");
602 for (p = str; p < contx.lastpos; p++)
607 if (*lastpos == NULL || *lastpos < contx.lastpos)
608 *lastpos = contx.lastpos;
610 if (*lastpos == NULL)
617 regmatch1(regexchar * re, longchar * c)
622 if (c->type == RE_TYPE_SYMBOL)
625 switch (RE_MODE(re)) {
629 printf("%s vs any. -> 1\n", lc2c(c, 1));
630 #endif /* REGEX_DEBUG */
633 ans = match_longchar(re->p.pattern, c, re->mode & RE_IGNCASE);
636 printf("RE=%s vs %s -> %d\n", lc2c(re->p.pattern, 1), lc2c(c, 1),
638 #endif /* REGEX_DEBUG */
641 return matchWhich(re->p.pattern, c, re->mode & RE_IGNCASE);
643 return !matchWhich(re->p.pattern, c, re->mode & RE_IGNCASE);
649 matchWhich(longchar * pattern, longchar * c, int igncase)
651 longchar *p = pattern;
656 printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(c, 1));
657 #endif /* REGEX_DEBUG */
658 while (p->type != RE_TYPE_END) {
659 if ((p + 1)->type == RE_WHICH_RANGE && (p + 2)->type != RE_TYPE_END) {
660 if (match_range_longchar(p, p + 2, c, igncase)) {
667 if (match_longchar(p, c, igncase)) {
676 printf(" -> %d\n", ans);
677 #endif /* REGEX_DEBUG */
682 match_longchar(longchar * a, longchar * b, int ignore)
685 if (a->type != b->type)
687 if (a->type == RE_TYPE_WCHAR_T)
688 return (a->wch.ccs == b->wch.ccs) && (a->wch.code == b->wch.code);
690 if (ignore && IS_ALPHA(b->ch))
691 return (a->ch == TOLOWER(b->ch) || a->ch == TOUPPER(b->ch));
693 return a->ch == b->ch;
697 match_range_longchar(longchar * a, longchar * b, longchar * c, int ignore)
700 if (a->type != b->type || a->type != c->type)
702 if (a->type == RE_TYPE_WCHAR_T)
703 return ((a->wch.ccs == c->wch.ccs && c->wch.ccs == b->wch.ccs) &&
704 (a->wch.code <= c->wch.code && c->wch.code <= b->wch.code));
706 if (ignore && IS_ALPHA(c->ch))
707 return ((a->ch <= TOLOWER(c->ch) && TOLOWER(c->ch) <= b->ch) ||
708 (a->ch <= TOUPPER(c->ch) && TOUPPER(c->ch) <= b->ch));
710 return (a->ch <= c->ch && c->ch <= b->ch);
715 lc2c(longchar * x, int len)
721 while (x[j].type != RE_TYPE_END && j < len) {
722 if (x[j].type == RE_WHICH_RANGE)
725 else if (x[j].type == RE_TYPE_WCHAR_T) {
727 sprintf(buf, "[%x-%x]", x[j].wch.ccs, x[j].wch.code);
737 r = GC_malloc_atomic(i + 1);
743 debugre(regexchar * re)
745 for (; RE_MODE(re) != RE_ENDMARK; re++) {
746 switch (RE_MODE(re)) {
754 if (re->mode & RE_ANYTIME)
756 if (re->mode & RE_OPT)
759 switch (RE_MODE(re)) {
764 printf("Match-to'%c' ", *re->p.pattern);
767 printf("One-of\"%s\" ", lc2c(re->p.pattern, 10000));
770 printf("Other-than\"%s\" ", lc2c(re->p.pattern, 10000));
774 Regex *r = re->p.sub;
791 #endif /* REGEX_DEBUG */
795 main(int argc, char **argv)
797 char buf[128], buf2[128];
805 wtf_init(WC_CES_EUC_JP, WC_CES_EUC_JP);
808 for (i = 1; i < argc; i++) {
809 if (strcmp(argv[i], "-v") == 0)
817 f = fopen(argv[i], "r");
819 fprintf(stderr, "Can't open %s\n", argv[i]);
822 while (fscanf(f, "%s%s", buf, buf2) == 2) {
823 re = newRegex(buf, 0, NULL, &msg);
825 printf("Error on regexp /%s/: %s\n", buf, msg);
828 if (RegexMatch(re, buf2, -1, 1)) {
829 printf("/%s/\t\"%s\"\t\"", buf, buf2);
830 MatchedPosition(re, &fpos, &epos);
836 printf("/%s/\t\"%s\"\tno_match", buf, buf2);