1 /* Extended regular expression matching and search library,
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* AIX requires this to be the first thing in the file. */
23 #if defined _AIX && !defined REGEX_MALLOC
35 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
36 # define PARAMS(args) args
38 # define PARAMS(args) ()
40 #endif /* Not PARAMS. */
42 #if defined STDC_HEADERS && !defined emacs
45 /* We need this for `regex.h', and perhaps for the Emacs include files. */
46 # include <sys/types.h>
49 #define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
51 /* For platforms which support the ISO C Amendment 1 functionality we
52 support user-defined character classes. */
53 #if defined _LIBC || WIDE_CHAR_SUPPORT
54 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
59 /* This is for multi byte string support. */
61 # define CHAR_TYPE wchar_t
62 # define US_CHAR_TYPE wchar_t/* unsigned character type */
63 # define COMPILED_BUFFER_VAR wc_buffer
64 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
65 # define PUT_CHAR(c) printf ("%C", c) /* Should we use wide stream?? */
69 # define CHAR_TYPE char
70 # define US_CHAR_TYPE unsigned char /* unsigned character type */
71 # define COMPILED_BUFFER_VAR bufp->buffer
72 # define OFFSET_ADDRESS_SIZE 2
73 # define PUT_CHAR(c) putchar (c)
74 #endif /* MBS_SUPPORT */
77 /* We have to keep the namespace clean. */
78 # define regfree(preg) __regfree (preg)
79 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
80 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
81 # define regerror(errcode, preg, errbuf, errbuf_size) \
82 __regerror(errcode, preg, errbuf, errbuf_size)
83 # define re_set_registers(bu, re, nu, st, en) \
84 __re_set_registers (bu, re, nu, st, en)
85 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
86 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
87 # define re_match(bufp, string, size, pos, regs) \
88 __re_match (bufp, string, size, pos, regs)
89 # define re_search(bufp, string, size, startpos, range, regs) \
90 __re_search (bufp, string, size, startpos, range, regs)
91 # define re_compile_pattern(pattern, length, bufp) \
92 __re_compile_pattern (pattern, length, bufp)
93 # define re_set_syntax(syntax) __re_set_syntax (syntax)
94 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
95 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
96 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
98 # define btowc __btowc
100 /* We are also using some library internals. */
101 # include <locale/localeinfo.h>
102 # include <locale/elem-hash.h>
103 # include <langinfo.h>
104 # include <locale/coll-lookup.h>
107 /* This is for other GNU distributions with internationalized messages. */
108 #if HAVE_LIBINTL_H || defined _LIBC
109 # include <libintl.h>
112 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
115 # define gettext(msgid) (msgid)
119 /* This define is so xgettext can find the internationalizable
121 # define gettext_noop(String) String
124 /* The `emacs' switch turns on certain matching commands
125 that make sense only in Emacs. */
132 #else /* not emacs */
134 /* If we are not linking with Emacs proper,
135 we can't use the relocating allocator
136 even if config.h says that we can. */
139 # if defined STDC_HEADERS || defined _LIBC
146 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
147 If nothing else has been done, use the method below. */
148 # ifdef INHIBIT_STRING_HEADER
149 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
150 # if !defined bzero && !defined bcopy
151 # undef INHIBIT_STRING_HEADER
156 /* This is the normal way of making sure we have a bcopy and a bzero.
157 This is used in most programs--a few other programs avoid this
158 by defining INHIBIT_STRING_HEADER. */
159 # ifndef INHIBIT_STRING_HEADER
160 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
164 # define bzero(s, n) (memset (s, '\0', n), (s))
166 # define bzero(s, n) __bzero (s, n)
170 # include <strings.h>
172 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
175 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
180 /* Define the syntax stuff for \<, \>, etc. */
182 /* This must be nonzero for the wordchar and notwordchar pattern
183 commands in re_match_2. */
188 # ifdef SWITCH_ENUM_BUG
189 # define SWITCH_ENUM_CAST(x) ((int)(x))
191 # define SWITCH_ENUM_CAST(x) (x)
194 #endif /* not emacs */
196 #if defined _LIBC || HAVE_LIMITS_H
201 # define MB_LEN_MAX 1
204 /* Get the interface, including the syntax bits. */
207 /* isalpha etc. are used for the character classes. */
210 /* Jim Meyering writes:
212 "... Some ctype macros are valid only for character codes that
213 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
214 using /bin/cc or gcc but without giving an ansi option). So, all
215 ctype uses should be through macros like ISPRINT... If
216 STDC_HEADERS is defined, then autoconf has verified that the ctype
217 macros don't need to be guarded with references to isascii. ...
218 Defining isascii to 1 should let any compiler worth its salt
219 eliminate the && through constant folding."
220 Solaris defines some of these symbols so we must undefine them first. */
223 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
224 # define ISASCII(c) 1
226 # define ISASCII(c) isascii(c)
230 # define ISBLANK(c) (ISASCII (c) && isblank (c))
232 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
235 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
237 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
/* Character-class wrappers.  Each one consults the corresponding ctype
   classifier only when ISASCII (c) holds; ISASCII is defined earlier
   either as the constant 1 or as isascii (c).  The argument C appears
   twice in each expansion, so it should not have side effects.  */
241 #define ISPRINT(c) (ISASCII (c) && isprint (c))
242 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
243 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
244 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
245 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
246 #define ISLOWER(c) (ISASCII (c) && islower (c))
247 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
248 #define ISSPACE(c) (ISASCII (c) && isspace (c))
249 #define ISUPPER(c) (ISASCII (c) && isupper (c))
250 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
253 # define TOLOWER(c) _tolower(c)
255 # define TOLOWER(c) tolower(c)
/* Fallback null pointer constant.  The replacement list is fully
   parenthesized so that it behaves as a single primary expression
   (for instance, `sizeof NULL' parses correctly).  */
# define NULL ((void *)0)
262 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
263 since ours (we hope) works properly with all combinations of
264 machines, compilers, `char' and `unsigned char' argument types.
265 (Per Bothner suggested the basic approach.) */
266 #undef SIGN_EXTEND_CHAR
268 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
269 #else /* not __STDC__ */
270 /* As in Harbison and Steele. */
271 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
275 /* How many characters in the character set. */
276 # define CHAR_SET_SIZE 256
280 extern char *re_syntax_table;
282 # else /* not SYNTAX_TABLE */
284 static char re_syntax_table[CHAR_SET_SIZE];
294 bzero (re_syntax_table, sizeof re_syntax_table);
296 for (c = 0; c < CHAR_SET_SIZE; ++c)
298 re_syntax_table[c] = Sword;
300 re_syntax_table['_'] = Sword;
305 # endif /* not SYNTAX_TABLE */
307 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
311 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
312 use `alloca' instead of `malloc'. This is because using malloc in
313 re_search* or re_match* could cause memory leaks when C-g is used in
314 Emacs; also, malloc is slower and causes storage fragmentation. On
315 the other hand, malloc is more portable, and easier to debug.
317 Because we sometimes use alloca, some routines have to be macros,
318 not functions -- `alloca'-allocated space disappears at the end of the
319 function it is called in. */
323 # define REGEX_ALLOCATE malloc
324 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
325 # define REGEX_FREE free
327 #else /* not REGEX_MALLOC */
329 /* Emacs already defines alloca, sometimes. */
332 /* Make alloca work the best possible way. */
334 # define alloca __builtin_alloca
335 # else /* not __GNUC__ */
338 # endif /* HAVE_ALLOCA_H */
339 # endif /* not __GNUC__ */
341 # endif /* not alloca */
343 # define REGEX_ALLOCATE alloca
345 /* Assumes a `char *destination' variable. */
346 # define REGEX_REALLOCATE(source, osize, nsize) \
347 (destination = (char *) alloca (nsize), \
348 memcpy (destination, source, osize))
350 /* No need to do anything to free, after alloca. */
351 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
353 #endif /* not REGEX_MALLOC */
355 /* Define how to allocate the failure stack. */
357 #if defined REL_ALLOC && defined REGEX_MALLOC
359 # define REGEX_ALLOCATE_STACK(size) \
360 r_alloc (&failure_stack_ptr, (size))
361 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
362 r_re_alloc (&failure_stack_ptr, (nsize))
363 # define REGEX_FREE_STACK(ptr) \
364 r_alloc_free (&failure_stack_ptr)
366 #else /* not using relocating allocator */
370 # define REGEX_ALLOCATE_STACK malloc
371 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
372 # define REGEX_FREE_STACK free
374 # else /* not REGEX_MALLOC */
376 # define REGEX_ALLOCATE_STACK alloca
378 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything: the stack came from alloca.
   Expand to a void expression rather than to nothing, matching
   REGEX_FREE, so that a call is a well-formed expression statement
   and does not leave a bare `;' behind.  ARG is not evaluated.  */
# define REGEX_FREE_STACK(arg) ((void) 0)
383 # endif /* not REGEX_MALLOC */
384 #endif /* not using relocating allocator */
387 /* True if `size1' is nonzero and PTR is pointing anywhere inside
388 `string1' or just past its end. This works if PTR is NULL, which is
390 #define FIRST_STRING_P(ptr) \
391 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
393 /* (Re)Allocate N items of type T using malloc, or fail. */
/* TALLOC evaluates to the malloc result cast to `t *'.  The product
   (n) * sizeof (t) is not checked for overflow.  */
394 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
/* RETALLOC assigns the realloc result straight back to ADDR, so ADDR must
   be an lvalue; if realloc returns a null pointer, ADDR is overwritten
   with it and the old block can no longer be reached through ADDR.  */
395 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
/* RETALLOC_IF resizes ADDR with RETALLOC when ADDR is non-null, and
   otherwise makes a first allocation with TALLOC.  It expands to a bare
   if/else statement (no trailing semicolon), so it can only be used
   where a statement is allowed.  */
396 #define RETALLOC_IF(addr, n, t) \
397 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
/* REGEX_TALLOC is like TALLOC but allocates through REGEX_ALLOCATE.  */
398 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
400 #define BYTEWIDTH 8 /* In bits. */
402 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
/* Larger and smaller of two values.  Each argument appears twice in the
   expansion, so arguments with side effects may be evaluated twice.  */
406 #define MAX(a, b) ((a) > (b) ? (a) : (b))
407 #define MIN(a, b) ((a) < (b) ? (a) : (b))
409 typedef char boolean;
413 static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
414 const char *string1, int size1,
415 const char *string2, int size2,
417 struct re_registers *regs,
420 /* These are the command codes that appear in compiled regular
421 expressions. Some opcodes are followed by argument bytes. A
422 command code can specify any interpretation whatsoever for its
423 arguments. Zero bytes may appear in the compiled regular expression. */
429 /* Succeed right away--no more backtracking. */
432 /* Followed by one byte giving n, then by n literal bytes. */
436 /* Same as exactn, but contains binary data. */
440 /* Matches any (more or less) character. */
443 /* Matches any one char belonging to specified set. First
444 following byte is number of bitmap bytes. Then come bytes
445 for a bitmap saying which chars are in. Bits in each byte
446 are ordered low-bit-first. A character is in the set if its
447 bit is 1. A character too large to have a bit in the map is
448 automatically not in the set. */
449 /* ifdef MBS_SUPPORT, following element is length of character
450 classes, length of collating symbols, length of equivalence
451 classes, length of character ranges, and length of characters.
452 Next, character class element, collating symbols elements,
453 equivalence class elements, range elements, and character
455 See regex_compile function. */
458 /* Same parameters as charset, but match any character that is
459 not one of those specified. */
462 /* Start remembering the text that is matched, for storing in a
463 register. Followed by one byte with the register number, in
464 the range 0 to one less than the pattern buffer's re_nsub
465 field. Then followed by one byte with the number of groups
466 inner to this one. (This last has to be part of the
467 start_memory only because we need it in the on_failure_jump
471 /* Stop remembering the text that is matched and store it in a
472 memory register. Followed by one byte with the register
473 number, in the range 0 to one less than `re_nsub' in the
474 pattern buffer, and one byte with the number of inner groups,
475 just like `start_memory'. (We need the number of inner
476 groups here because we don't have any easy way of finding the
477 corresponding start_memory when we're at a stop_memory.) */
480 /* Match a duplicate of something remembered. Followed by one
481 byte containing the register number. */
484 /* Fail unless at beginning of line. */
487 /* Fail unless at end of line. */
490 /* Succeeds if at beginning of buffer (if emacs) or at beginning
491 of string to be matched (if not). */
494 /* Analogously, for end of buffer/string. */
497 /* Followed by two byte relative address to which to jump. */
500 /* Same as jump, but marks the end of an alternative. */
503 /* Followed by two-byte relative address of place to resume at
504 in case of failure. */
505 /* ifdef MBS_SUPPORT, the size of address is 1. */
508 /* Like on_failure_jump, but pushes a placeholder instead of the
509 current string position when executed. */
510 on_failure_keep_string_jump,
512 /* Throw away latest failure point and then jump to following
513 two-byte relative address. */
514 /* ifdef MBS_SUPPORT, the size of address is 1. */
517 /* Change to pop_failure_jump if we know we won't have to backtrack to
518 match; otherwise change to jump. This is used to jump
519 back to the beginning of a repeat. If what follows this jump
520 clearly won't match what the repeat does, such that we can be
521 sure that there is no use backtracking out of repetitions
522 already matched, then we change it to a pop_failure_jump.
523 Followed by two-byte address. */
524 /* ifdef MBS_SUPPORT, the size of address is 1. */
527 /* Jump to following two-byte address, and push a dummy failure
528 point. This failure point will be thrown away if an attempt
529 is made to use it for a failure. A `+' construct makes this
530 before the first repeat. Also used as an intermediary kind
531 of jump when compiling an alternative. */
532 /* ifdef MBS_SUPPORT, the size of address is 1. */
535 /* Push a dummy failure point and continue. Used at the end of
539 /* Followed by two-byte relative address and two-byte number n.
540 After matching N times, jump to the address upon failure. */
541 /* ifdef MBS_SUPPORT, the size of address is 1. */
544 /* Followed by two-byte relative address, and two-byte number n.
545 Jump to the address N times, then fail. */
546 /* ifdef MBS_SUPPORT, the size of address is 1. */
549 /* Set the following two-byte relative address to the
550 subsequent two-byte number. The address *includes* the two
552 /* ifdef MBS_SUPPORT, the size of address is 1. */
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
565 ,before_dot, /* Succeeds if before point. */
566 at_dot, /* Succeeds if at point. */
567 after_dot, /* Succeeds if after point. */
569 /* Matches any character whose syntax is specified. Followed by
570 a byte which contains a syntax code, e.g., Sword. */
573 /* Matches any character whose syntax is not that specified. */
578 /* Common operations on the compiled pattern. */
580 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
581 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
584 # define STORE_NUMBER(destination, number) \
586 *(destination) = (US_CHAR_TYPE)(number); \
589 # define STORE_NUMBER(destination, number) \
591 (destination)[0] = (number) & 0377; \
592 (destination)[1] = (number) >> 8; \
594 #endif /* MBS_SUPPORT */
596 /* Same as STORE_NUMBER, except increment DESTINATION to
597 the byte after where the number is stored. Therefore, DESTINATION
598 must be an lvalue. */
599 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
601 #define STORE_NUMBER_AND_INCR(destination, number) \
603 STORE_NUMBER (destination, number); \
604 (destination) += OFFSET_ADDRESS_SIZE; \
607 /* Put into DESTINATION a number stored in two contiguous bytes starting
609 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
612 # define EXTRACT_NUMBER(destination, source) \
614 (destination) = *(source); \
617 # define EXTRACT_NUMBER(destination, source) \
619 (destination) = *(source) & 0377; \
620 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
625 static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source));
627 extract_number (dest, source)
629 US_CHAR_TYPE *source;
634 int temp = SIGN_EXTEND_CHAR (*(source + 1));
635 *dest = *source & 0377;
640 # ifndef EXTRACT_MACROS /* To debug the macros. */
641 # undef EXTRACT_NUMBER
642 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
643 # endif /* not EXTRACT_MACROS */
647 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
648 SOURCE must be an lvalue. */
650 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
652 EXTRACT_NUMBER (destination, source); \
653 (source) += OFFSET_ADDRESS_SIZE; \
657 static void extract_number_and_incr _RE_ARGS ((int *destination,
658 US_CHAR_TYPE **source));
660 extract_number_and_incr (destination, source)
662 US_CHAR_TYPE **source;
664 extract_number (destination, *source);
665 *source += OFFSET_ADDRESS_SIZE;
668 # ifndef EXTRACT_MACROS
669 # undef EXTRACT_NUMBER_AND_INCR
670 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
671 extract_number_and_incr (&dest, &src)
672 # endif /* not EXTRACT_MACROS */
676 /* If DEBUG is defined, Regex prints many voluminous messages about what
677 it is doing (if the variable `debug' is nonzero). If linked with the
678 main program in `iregex.c', you can enter patterns and strings
679 interactively. And if linked with the main program in `main.c' and
680 the other test files, you can run the already-written tests. */
684 /* We use standard I/O for debugging. */
687 /* It is useful to test things that ``must'' be true when debugging. */
/* Debugging helpers, active only when the `debug' variable is nonzero.

   DEBUG_STATEMENT (e) simply expands to E.  The DEBUG_PRINTn macros
   forward their arguments to printf; X / X1 is the format string.
   DEBUG_PRINT_COMPILED_PATTERN ignores its first argument and prints
   the compiled pattern between S and E; DEBUG_PRINT_DOUBLE_STRING
   forwards all of its arguments to print_double_string.

   Every printing macro is wrapped in `do { ... } while (0)' so that it
   acts as one statement: with a bare `if (debug) ...' expansion, an
   `else' following the macro call would silently attach to the macro's
   own `if'.  Calls must be terminated with a semicolon.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) \
  do { if (debug) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2) \
  do { if (debug) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug) printf (x1, x2, x3, x4); } while (0)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug) print_double_string (w, s1, sz1, s2, sz2); } while (0)
703 /* Print the fastmap in human-readable form. */
706 print_fastmap (fastmap)
709 unsigned was_a_range = 0;
712 while (i < (1 << BYTEWIDTH))
718 while (i < (1 << BYTEWIDTH) && fastmap[i])
734 /* Print a compiled pattern string in human-readable form, starting at
735 the START pointer into it and ending just before the pointer END. */
738 print_partial_compiled_pattern (start, end)
744 US_CHAR_TYPE *p = start;
745 US_CHAR_TYPE *pend = end;
753 /* Loop over pattern commands. */
757 printf ("%t:\t", p - start);
759 printf ("%ld:\t", (long int) (p - start));
762 switch ((re_opcode_t) *p++)
770 printf ("/exactn/%d", mcnt);
782 printf ("/exactn_bin/%d", mcnt);
789 #endif /* MBS_SUPPORT */
793 printf ("/start_memory/%d/%d", mcnt, *p++);
798 printf ("/stop_memory/%d/%d", mcnt, *p++);
802 printf ("/duplicate/%d", *p++);
815 printf ("/charset [%s",
816 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
818 length = *workp++; /* the length of char_classes */
819 for (i=0 ; i<length ; i++)
820 printf("[:%x:]", *p++);
821 length = *workp++; /* the length of collating_symbol */
822 for (i=0 ; i<length ;)
826 PUT_CHAR((i++,*p++));
830 length = *workp++; /* the length of equivalence_class */
831 for (i=0 ; i<length ;)
835 PUT_CHAR((i++,*p++));
839 length = *workp++; /* the length of char_range */
840 for (i=0 ; i<length ; i++)
842 wchar_t range_start = *p++;
843 wchar_t range_end = *p++;
844 printf("%C-%C", range_start, range_end);
846 length = *workp++; /* the length of char */
847 for (i=0 ; i<length ; i++)
851 register int c, last = -100;
852 register int in_range = 0;
854 printf ("/charset [%s",
855 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
857 assert (p + *p < pend);
859 for (c = 0; c < 256; c++)
861 && (p[1 + (c/8)] & (1 << (c % 8))))
863 /* Are we starting a range? */
864 if (last + 1 == c && ! in_range)
869 /* Have we broken a range? */
870 else if (last + 1 != c && in_range)
888 #endif /* MBS_SUPPORT */
900 case on_failure_jump:
901 extract_number_and_incr (&mcnt, &p);
903 printf ("/on_failure_jump to %t", p + mcnt - start);
905 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
909 case on_failure_keep_string_jump:
910 extract_number_and_incr (&mcnt, &p);
912 printf ("/on_failure_keep_string_jump to %t", p + mcnt - start);
914 printf ("/on_failure_keep_string_jump to %ld",
915 (long int) (p + mcnt - start));
919 case dummy_failure_jump:
920 extract_number_and_incr (&mcnt, &p);
922 printf ("/dummy_failure_jump to %t", p + mcnt - start);
924 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
928 case push_dummy_failure:
929 printf ("/push_dummy_failure");
933 extract_number_and_incr (&mcnt, &p);
935 printf ("/maybe_pop_jump to %t", p + mcnt - start);
937 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
941 case pop_failure_jump:
942 extract_number_and_incr (&mcnt, &p);
944 printf ("/pop_failure_jump to %t", p + mcnt - start);
946 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
951 extract_number_and_incr (&mcnt, &p);
953 printf ("/jump_past_alt to %t", p + mcnt - start);
955 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
960 extract_number_and_incr (&mcnt, &p);
962 printf ("/jump to %t", p + mcnt - start);
964 printf ("/jump to %ld", (long int) (p + mcnt - start));
969 extract_number_and_incr (&mcnt, &p);
971 extract_number_and_incr (&mcnt2, &p);
973 printf ("/succeed_n to %t, %d times", p1 - start, mcnt2);
975 printf ("/succeed_n to %ld, %d times",
976 (long int) (p1 - start), mcnt2);
981 extract_number_and_incr (&mcnt, &p);
983 extract_number_and_incr (&mcnt2, &p);
984 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
988 extract_number_and_incr (&mcnt, &p);
990 extract_number_and_incr (&mcnt2, &p);
992 printf ("/set_number_at location %t to %d", p1 - start, mcnt2);
994 printf ("/set_number_at location %ld to %d",
995 (long int) (p1 - start), mcnt2);
1000 printf ("/wordbound");
1004 printf ("/notwordbound");
1008 printf ("/wordbeg");
1012 printf ("/wordend");
1017 printf ("/before_dot");
1025 printf ("/after_dot");
1029 printf ("/syntaxspec");
1031 printf ("/%d", mcnt);
1035 printf ("/notsyntaxspec");
1037 printf ("/%d", mcnt);
1042 printf ("/wordchar");
1046 printf ("/notwordchar");
1058 printf ("?%d", *(p-1));
1065 printf ("%t:\tend of pattern.\n", p - start);
1067 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1073 print_compiled_pattern (bufp)
1074 struct re_pattern_buffer *bufp;
1076 US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer;
1078 print_partial_compiled_pattern (buffer, buffer
1079 + bufp->used / sizeof(US_CHAR_TYPE));
1080 printf ("%ld bytes used/%ld bytes allocated.\n",
1081 bufp->used, bufp->allocated);
1083 if (bufp->fastmap_accurate && bufp->fastmap)
1085 printf ("fastmap: ");
1086 print_fastmap (bufp->fastmap);
1090 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1092 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1094 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1095 printf ("can_be_null: %d\t", bufp->can_be_null);
1096 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1097 printf ("no_sub: %d\t", bufp->no_sub);
1098 printf ("not_bol: %d\t", bufp->not_bol);
1099 printf ("not_eol: %d\t", bufp->not_eol);
1100 printf ("syntax: %lx\n", bufp->syntax);
1101 /* Perhaps we should print the translate table? */
1106 print_double_string (where, string1, size1, string2, size2)
1107 const CHAR_TYPE *where;
1108 const CHAR_TYPE *string1;
1109 const CHAR_TYPE *string2;
1119 if (FIRST_STRING_P (where))
1121 for (this_char = where - string1; this_char < size1; this_char++)
1122 PUT_CHAR (string1[this_char]);
1127 for (this_char = where - string2; this_char < size2; this_char++)
1128 PUT_CHAR (string2[this_char]);
1139 #else /* not DEBUG */
1144 # define DEBUG_STATEMENT(e)
1145 # define DEBUG_PRINT1(x)
1146 # define DEBUG_PRINT2(x1, x2)
1147 # define DEBUG_PRINT3(x1, x2, x3)
1148 # define DEBUG_PRINT4(x1, x2, x3, x4)
1149 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1150 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1152 #endif /* not DEBUG */
1155 /* This converts a multibyte string to a wide character string,
1156 writes their correspondences to offset_buffer (see below),
1157 and writes whether each wchar_t is binary data to is_binary.
1158 Invalid multibyte sequences are treated as binary data.
1159 We assume offset_buffer and is_binary are already allocated
1162 convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
1164 const unsigned char* src;
1165 size_t len; /* the length of multibyte string. */
1167 /* It holds the correspondences between src (char string) and
1168 dest (wchar_t string) for optimization.
1170 dest = {'X', 'Y', 'Z'}
1171 (each "xxx", "y" and "zz" represent one multibyte character
1172 corresponding to 'X', 'Y' and 'Z'.)
1173 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1179 wchar_t *pdest = dest;
1180 const unsigned char *psrc = src;
1181 size_t wc_count = 0;
1183 if (MB_CUR_MAX == 1)
1184 { /* We don't need conversion. */
1185 for ( ; wc_count < len ; ++wc_count)
1188 is_binary[wc_count] = FALSE;
1189 offset_buffer[wc_count] = wc_count;
1191 offset_buffer[wc_count] = wc_count;
1195 /* We need conversion. */
1198 size_t mb_remain = len;
1199 size_t mb_count = 0;
1201 /* Initialize the conversion state. */
1202 memset (&mbs, 0, sizeof (mbstate_t));
1204 offset_buffer[0] = 0;
1205 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1208 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1211 /* Failed to convert; maybe src contains binary data.
1212 So we consume 1 byte manually. */
1216 is_binary[wc_count] = TRUE;
1219 is_binary[wc_count] = FALSE;
1220 /* In sjis encoding, we use yen sign as escape character in
1221 place of reverse solidus. So we convert 0x5c(yen sign in
1222 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1223 solidus in UCS2). */
1224 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1225 *pdest = (wchar_t) *psrc;
1227 offset_buffer[wc_count + 1] = mb_count += consumed;
1234 #endif /* MBS_SUPPORT */
1236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1237 also be assigned to arbitrarily: each pattern buffer stores its own
1238 syntax, so it can be changed between regex compilations. */
1239 /* This has no initializer because initialized variables in Emacs
1240 become read-only after dumping. */
1241 reg_syntax_t re_syntax_options;
1244 /* Specify the precise syntax of regexps for compilation. This provides
1245 for compatibility for various utilities which historically have
1246 different, incompatible syntaxes.
1248 The argument SYNTAX is a bit mask comprised of the various bits
1249 defined in regex.h. We return the old syntax. */
1252 re_set_syntax (syntax)
1253 reg_syntax_t syntax;
1255 reg_syntax_t ret = re_syntax_options;
1257 re_syntax_options = syntax;
1259 if (syntax & RE_DEBUG)
1261 else if (debug) /* was on but now is not */
1267 weak_alias (__re_set_syntax, re_set_syntax)
1270 /* This table gives an error message for each of the error codes listed
1271 in regex.h. Obviously the order here has to be same as there.
1272 POSIX doesn't require that we do anything for REG_NOERROR,
1273 but why not be nice? */
1275 static const char re_error_msgid[] =
1277 #define REG_NOERROR_IDX 0
1278 gettext_noop ("Success") /* REG_NOERROR */
1280 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1281 gettext_noop ("No match") /* REG_NOMATCH */
1283 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1284 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1286 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1287 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1289 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1290 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1292 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1293 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1295 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1296 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1298 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1299 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1301 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1302 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1304 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1305 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1307 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1308 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1310 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1311 gettext_noop ("Invalid range end") /* REG_ERANGE */
1313 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1314 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1316 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1317 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1319 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1320 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1322 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1323 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1325 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1326 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1329 static const size_t re_error_msgid_idx[] =
1350 /* Avoiding alloca during matching, to placate r_alloc. */
1352 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1353 searching and matching functions should not call alloca. On some
1354 systems, alloca is implemented in terms of malloc, and if we're
1355 using the relocating allocator routines, then malloc could cause a
1356 relocation, which might (if the strings being searched are in the
1357 ralloc heap) shift the data out from underneath the regexp
1360 Here's another reason to avoid allocation: Emacs
1361 processes input from X in a signal handler; processing X input may
1362 call malloc; if input arrives while a matching routine is calling
1363 malloc, then we're scrod. But Emacs can't just block input while
1364 calling matching routines; then we don't notice interrupts when
1365 they come in. So, Emacs blocks input around all regexp calls
1366 except the matching calls, which it leaves unprotected, in the
1367 faith that they will not malloc. */
1369 /* Normally, this is fine: by default the match routines may allocate.
The definition is withdrawn again below when C_ALLOCA or REGEX_MALLOC
is defined together with emacs. */
1370 #define MATCH_MAY_ALLOCATE
1372 /* When using GNU C, we are not REALLY using the C alloca, no matter
1373 what config.h may say. So don't take precautions for it. */
1378 /* The match routines may not allocate if (1) they would do it with malloc
1379 and (2) it's not safe for them to use malloc.
1380 Note that if REL_ALLOC is defined, matching would not use malloc for the
1381 failure stack, but we would still use it for the register vectors;
1382 so REL_ALLOC should not affect this. */
1383 #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1384 # undef MATCH_MAY_ALLOCATE
1388 /* Failure stack declarations and macros; both re_compile_fastmap and
1389 re_match_2 use a failure stack. These have to be macros because of
1390 REGEX_ALLOCATE_STACK. */
1393 /* Number of failure points for which to initially allocate space
1394 when matching. If this number is exceeded, we allocate more
1395 space, so it is not a hard limit. */
1396 #ifndef INIT_FAILURE_ALLOC
1397 # define INIT_FAILURE_ALLOC 5
1400 /* Roughly the maximum number of failure points on the stack. It would be
1401 exactly that if we always used MAX_FAILURE_ITEMS items each time we failed.
1402 This is a variable only so users of regex can assign to it; we never
1403 change it ourselves. */
1407 # if defined MATCH_MAY_ALLOCATE
1408 /* 4400 was enough to cause a crash on Alpha OSF/1,
1409 whose default stack limit is 2mb. */
1410 long int re_max_failures = 4000;
1412 long int re_max_failures = 2000;
1415 union fail_stack_elt
1417 US_CHAR_TYPE *pointer;
1421 typedef union fail_stack_elt fail_stack_elt_t;
1425 fail_stack_elt_t *stack;
1426 unsigned long int size;
1427 unsigned long int avail; /* Offset of next open position. */
1430 #else /* not INT_IS_16BIT */
1432 # if defined MATCH_MAY_ALLOCATE
1433 /* 4400 was enough to cause a crash on Alpha OSF/1,
1434 whose default stack limit is 2mb. */
1435 int re_max_failures = 4000;
1437 int re_max_failures = 2000;
1440 union fail_stack_elt
1442 US_CHAR_TYPE *pointer;
1446 typedef union fail_stack_elt fail_stack_elt_t;
1450 fail_stack_elt_t *stack;
1452 unsigned avail; /* Offset of next open position. */
1455 #endif /* INT_IS_16BIT */
/* Predicates on the failure stack.  FAIL_STACK_EMPTY and FAIL_STACK_FULL
   read a variable named `fail_stack'; FAIL_STACK_PTR_EMPTY reads through
   a pointer named `fail_stack_ptr'.  "Full" means `avail' has reached
   `size', i.e. there is no free slot left at the current size.  */
1457 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1458 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1459 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1462 /* Define macros to initialize and free the failure stack.
1463 Do `return -2' if the alloc fails. */
1465 #ifdef MATCH_MAY_ALLOCATE
1466 # define INIT_FAIL_STACK() \
1468 fail_stack.stack = (fail_stack_elt_t *) \
1469 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1471 if (fail_stack.stack == NULL) \
1474 fail_stack.size = INIT_FAILURE_ALLOC; \
1475 fail_stack.avail = 0; \
1478 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1480 # define INIT_FAIL_STACK() \
1482 fail_stack.avail = 0; \
1485 # define RESET_FAIL_STACK()
1489 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1491 Return 1 if succeeds, and 0 if either ran out of memory
1492 allocating space for it or it was already too large.
1494 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1496 #define DOUBLE_FAIL_STACK(fail_stack) \
1497 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1499 : ((fail_stack).stack = (fail_stack_elt_t *) \
1500 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1501 (fail_stack).size * sizeof (fail_stack_elt_t), \
1502 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1504 (fail_stack).stack == NULL \
1506 : ((fail_stack).size <<= 1, \
1510 /* Push pointer POINTER on FAIL_STACK.
1511 Return 1 if was able to do so and 0 if ran out of memory allocating
1513 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1514 ((FAIL_STACK_FULL () \
1515 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1517 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1520 /* Push a pointer value onto the failure stack.
1521 Assumes the variable `fail_stack'. Probably should only
1522 be called from within `PUSH_FAILURE_POINT'.
ITEM is cast to `US_CHAR_TYPE *' and stored in the `pointer' member of
the slot at index `avail', which is then incremented. No capacity check
is made here, so the caller must already have ensured there is room. */
1523 #define PUSH_FAILURE_POINTER(item) \
1524 fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item)
1526 /* This pushes an integer-valued item onto the failure stack.
1527 Assumes the variable `fail_stack'. Probably should only
1528 be called from within `PUSH_FAILURE_POINT'.
ITEM is stored in the `integer' member of the slot at index `avail',
which is then incremented. No capacity check is made here. */
1529 #define PUSH_FAILURE_INT(item) \
1530 fail_stack.stack[fail_stack.avail++].integer = (item)
1532 /* Push a fail_stack_elt_t value onto the failure stack.
1533 Assumes the variable `fail_stack'. Probably should only
1534 be called from within `PUSH_FAILURE_POINT'.
ITEM is assigned as a whole element to the slot at index `avail',
which is then incremented. No capacity check is made here. */
1535 #define PUSH_FAILURE_ELT(item) \
1536 fail_stack.stack[fail_stack.avail++] = (item)
1538 /* These three POP... operations complement the three PUSH... operations.
1539 All assume that `fail_stack' is nonempty.
Each one decrements `avail' first and then yields the `pointer' member,
the `integer' member, or the whole element of the slot it now indexes. */
1540 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1541 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1542 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1544 /* Used to omit pushing failure point id's when we're not debugging. */
1546 # define DEBUG_PUSH PUSH_FAILURE_INT
1547 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1549 # define DEBUG_PUSH(item)
1550 # define DEBUG_POP(item_addr)
1554 /* Push the information about the state we will need
1555 if we ever fail back to it.
1557 Requires variables fail_stack, regstart, regend, reg_info, and
1558 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1561 Does `return FAILURE_CODE' if runs out of memory. */
1563 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1565 char *destination; \
1566 /* Must be int, so when we don't save any registers, the arithmetic \
1567 of 0 + -1 isn't done as unsigned. */ \
1568 /* Can't be int, since there is not a shred of a guarantee that int \
1569 is wide enough to hold a value of something to which pointer can \
1571 active_reg_t this_reg; \
1573 DEBUG_STATEMENT (failure_id++); \
1574 DEBUG_STATEMENT (nfailure_points_pushed++); \
1575 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1576 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1577 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1579 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1580 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1582 /* Ensure we have enough space allocated for what we will push. */ \
1583 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1585 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1586 return failure_code; \
1588 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1589 (fail_stack).size); \
1590 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1593 /* Push the info, starting with the registers. */ \
1594 DEBUG_PRINT1 ("\n"); \
1597 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1600 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1601 DEBUG_STATEMENT (num_regs_pushed++); \
1603 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1604 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1606 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1607 PUSH_FAILURE_POINTER (regend[this_reg]); \
1609 DEBUG_PRINT2 (" info: %p\n ", \
1610 reg_info[this_reg].word.pointer); \
1611 DEBUG_PRINT2 (" match_null=%d", \
1612 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1613 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1614 DEBUG_PRINT2 (" matched_something=%d", \
1615 MATCHED_SOMETHING (reg_info[this_reg])); \
1616 DEBUG_PRINT2 (" ever_matched=%d", \
1617 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1618 DEBUG_PRINT1 ("\n"); \
1619 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1622 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1623 PUSH_FAILURE_INT (lowest_active_reg); \
1625 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1626 PUSH_FAILURE_INT (highest_active_reg); \
1628 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1629 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1630 PUSH_FAILURE_POINTER (pattern_place); \
1632 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1633 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1635 DEBUG_PRINT1 ("'\n"); \
1636 PUSH_FAILURE_POINTER (string_place); \
1638 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1639 DEBUG_PUSH (failure_id); \
1642 /* This is the number of items that are pushed and popped on the stack
1643 for each register: in PUSH_FAILURE_POINT these are the register's
`regstart' pointer, its `regend' pointer and its `reg_info' word. */
1644 #define NUM_REG_ITEMS 3
1646 /* Individual items aside from the registers. */
1648 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1650 # define NUM_NONREG_ITEMS 4
1653 /* We push at most this many items on the stack. */
1654 /* We used to use (num_regs - 1), which is the number of registers
1655 this regexp will save; but that was changed to 5
1656 to avoid stack overflow for a regexp with lots of parens. */
1657 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1659 /* We actually push this many items. */
1660 #define NUM_FAILURE_ITEMS \
1662 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1666 /* How many items can still be added to the stack without overflowing it.
`avail' is an unsigned field, so the difference is only meaningful
while avail does not exceed size. */
1667 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1670 /* Pops what PUSH_FAILURE_POINT pushes.
1672 We restore into the parameters, all of which should be lvalues:
1673 STR -- the saved data position.
1674 PAT -- the saved pattern position.
1675 LOW_REG, HIGH_REG -- the lowest and highest active registers.
1676 REGSTART, REGEND -- arrays of string positions.
1677 REG_INFO -- array of information about each subexpression.
1679 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1680 `pend', `string1', `size1', `string2', and `size2'. */
1681 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1683 DEBUG_STATEMENT (unsigned failure_id;) \
1684 active_reg_t this_reg; \
1685 const US_CHAR_TYPE *string_temp; \
1687 assert (!FAIL_STACK_EMPTY ()); \
1689 /* Remove failure points and point to how many regs pushed. */ \
1690 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1691 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1692 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1694 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1696 DEBUG_POP (&failure_id); \
1697 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1699 /* If the saved string location is NULL, it came from an \
1700 on_failure_keep_string_jump opcode, and we want to throw away the \
1701 saved NULL, thus retaining our current position in the string. */ \
1702 string_temp = POP_FAILURE_POINTER (); \
1703 if (string_temp != NULL) \
1704 str = (const CHAR_TYPE *) string_temp; \
1706 DEBUG_PRINT2 (" Popping string %p: `", str); \
1707 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1708 DEBUG_PRINT1 ("'\n"); \
1710 pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \
1711 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1712 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1714 /* Restore register info. */ \
1715 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1716 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1718 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1719 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1722 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1724 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1726 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1727 DEBUG_PRINT2 (" info: %p\n", \
1728 reg_info[this_reg].word.pointer); \
1730 regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \
1731 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1733 regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\
1734 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1738 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1740 reg_info[this_reg].word.integer = 0; \
1741 regend[this_reg] = 0; \
1742 regstart[this_reg] = 0; \
1744 highest_active_reg = high_reg; \
1747 set_regs_matched_done = 0; \
1748 DEBUG_STATEMENT (nfailure_points_popped++); \
1749 } /* POP_FAILURE_POINT */
1752 /* Structure for per-register (a.k.a. per-group) information.
1753 Other register information, such as the
1754 starting and ending positions (which are addresses), and the list of
1755 inner groups (which is a bits list) are maintained in separate
1758 We are making a (strictly speaking) nonportable assumption here: that
1759 the compiler will pack our bit fields into something that fits into
1760 the type of `word', i.e., is something that fits into one item on the
1764 /* Declarations and macros for re_match_2. */
1768 fail_stack_elt_t word;
1771 /* This field is one if this group can match the empty string,
1772 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1773 #define MATCH_NULL_UNSET_VALUE 3
1774 unsigned match_null_string_p : 2;
1775 unsigned is_active : 1;
1776 unsigned matched_something : 1;
1777 unsigned ever_matched_something : 1;
1779 } register_info_type;
/* Accessors for the flag bits of a register_info_type R.  Each expands
   to the bit-field itself inside R's `bits' member, so it can be used
   both to read the flag and as the target of an assignment.  */
1781 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1782 #define IS_ACTIVE(R) ((R).bits.is_active)
1783 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1784 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1787 /* Call this when we have matched a real character; it sets `matched' flags
1788 for the subexpressions which we are currently inside. Also records
1789 that those subexprs have matched. */
1790 #define SET_REGS_MATCHED() \
1793 if (!set_regs_matched_done) \
1796 set_regs_matched_done = 1; \
1797 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1799 MATCHED_SOMETHING (reg_info[r]) \
1800 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1807 /* Registers are set to a sentinel when they haven't yet matched.
The sentinel is the address of the file-local dummy object declared
here; REG_UNSET (E) is nonzero when E equals that address. */
1808 static CHAR_TYPE reg_unset_dummy;
1809 #define REG_UNSET_VALUE (&reg_unset_dummy)
1810 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1812 /* Subroutine declarations and macros for regex_compile. */
1814 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1815 reg_syntax_t syntax,
1816 struct re_pattern_buffer *bufp));
1817 static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg));
1818 static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1819 int arg1, int arg2));
1820 static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1821 int arg, US_CHAR_TYPE *end));
1822 static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1823 int arg1, int arg2, US_CHAR_TYPE *end));
1824 static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern,
1826 reg_syntax_t syntax));
1827 static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p,
1828 const CHAR_TYPE *pend,
1829 reg_syntax_t syntax));
1831 static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start,
1832 const CHAR_TYPE **p_ptr,
1833 const CHAR_TYPE *pend,
1835 reg_syntax_t syntax,
1837 CHAR_TYPE *char_set));
1838 static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end));
1840 static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
1841 const CHAR_TYPE **p_ptr,
1842 const CHAR_TYPE *pend,
1844 reg_syntax_t syntax,
1846 #endif /* MBS_SUPPORT */
1848 /* Fetch the next character in the uncompiled pattern---translating it
1849 if necessary. Also cast from a signed character in the constant
1850 string passed to us by the user to an unsigned char that we can use
1851 as an array index (in, e.g., `translate'). */
1852 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1853 because it is impossible to allocate 4GB array for some encodings
1854 which have 4 byte character_set like UCS4. */
1857 # define PATFETCH(c) \
1858 do {if (p == pend) return REG_EEND; \
1859 c = (US_CHAR_TYPE) *p++; \
1860 if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \
1863 # define PATFETCH(c) \
1864 do {if (p == pend) return REG_EEND; \
1865 c = (unsigned char) *p++; \
1866 if (translate) c = (unsigned char) translate[c]; \
1868 # endif /* MBS_SUPPORT */
1871 /* Fetch the next character in the uncompiled pattern, with no
1873 #define PATFETCH_RAW(c) \
1874 do {if (p == pend) return REG_EEND; \
1875 c = (US_CHAR_TYPE) *p++; \
1878 /* Go backwards one character in the pattern. */
1879 #define PATUNFETCH p--
1882 /* If `translate' is non-null, return translate[D], else just D. We
1883 cast the subscript to translate because some data is declared as
1884 `char *', to avoid warnings when a string constant is passed. But
1885 when we use a character as a subscript we must make it unsigned. */
1886 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1887 because it is impossible to allocate 4GB array for some encodings
1888 which have 4 byte character_set like UCS4. */
1891 # define TRANSLATE(d) \
1892 (translate && (sizeof(d) <= 1)? (char) translate[(unsigned char) (d)] : (d))
1894 # define TRANSLATE(d) \
1895 (translate ? (char) translate[(unsigned char) (d)] : (d))
1896 # endif /* MBS_SUPPORT */
1900 /* Macros for outputting the compiled pattern into `buffer'. */
1902 /* If the buffer isn't allocated when it comes in, use this. */
1903 #define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE))
1905 /* Make sure we have at least N more bytes of space in buffer. */
1907 # define GET_BUFFER_SPACE(n) \
1908 while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
1909 + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \
1912 # define GET_BUFFER_SPACE(n) \
1913 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1915 #endif /* MBS_SUPPORT */
1917 /* Make sure we have one more byte of buffer space and then add C to it. */
1918 #define BUF_PUSH(c) \
1920 GET_BUFFER_SPACE (1); \
1921 *b++ = (US_CHAR_TYPE) (c); \
1925 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1926 #define BUF_PUSH_2(c1, c2) \
1928 GET_BUFFER_SPACE (2); \
1929 *b++ = (US_CHAR_TYPE) (c1); \
1930 *b++ = (US_CHAR_TYPE) (c2); \
1934 /* As with BUF_PUSH_2, except for three bytes. */
1935 #define BUF_PUSH_3(c1, c2, c3) \
1937 GET_BUFFER_SPACE (3); \
1938 *b++ = (US_CHAR_TYPE) (c1); \
1939 *b++ = (US_CHAR_TYPE) (c2); \
1940 *b++ = (US_CHAR_TYPE) (c3); \
1943 /* Store a jump with opcode OP at LOC to location TO. We store a
1944 relative address offset by the 1 + OFFSET_ADDRESS_SIZE units the jump
itself occupies (presumably the opcode plus its address operand). */
1945 #define STORE_JUMP(op, loc, to) \
1946 store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
1948 /* Likewise, for a two-argument jump. The relative offset is computed
the same way and ARG is handed to store_op2 unchanged as its last
argument. */
1949 #define STORE_JUMP2(op, loc, to, arg) \
1950 store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
1952 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end.
`b' is passed to insert_op1 as its END argument. */
1953 #define INSERT_JUMP(op, loc, to) \
1954 insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
1956 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1957 #define INSERT_JUMP2(op, loc, to, arg) \
1958 insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
1962 /* This is not an arbitrary limit: the arguments which represent offsets
1963 into the pattern are two bytes long. So if 2^16 bytes turns out to
1964 be too small, many things would have to change. */
1965 /* Any other compiler which, like MSC, has an allocation limit below 2^16
1966 bytes will have to use an approach similar to what was done below for
1967 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1968 reallocating to 0 bytes. Such a thing is not going to work too well.
1969 You have been warned!! */
1970 #if defined _MSC_VER && !defined WIN32
1971 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1972 The REALLOC define eliminates a flurry of conversion warnings,
1973 but is not required. */
1974 # define MAX_BUF_SIZE 65500L
1975 # define REALLOC(p,s) realloc ((p), (size_t) (s))
1977 # define MAX_BUF_SIZE (1L << 16)
1978 # define REALLOC(p,s) realloc ((p), (s))
1981 /* Extend the buffer to twice its current size via realloc and
1982 reset the pointers that pointed into the old block to point to the
1983 correct places in the new one. If extending the buffer results in it
1984 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1985 #if __BOUNDED_POINTERS__
1986 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1987 # define MOVE_BUFFER_POINTER(P) \
1988 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1989 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1992 SET_HIGH_BOUND (b); \
1993 SET_HIGH_BOUND (begalt); \
1994 if (fixup_alt_jump) \
1995 SET_HIGH_BOUND (fixup_alt_jump); \
1997 SET_HIGH_BOUND (laststart); \
1998 if (pending_exact) \
1999 SET_HIGH_BOUND (pending_exact); \
2002 # define MOVE_BUFFER_POINTER(P) (P) += incr
2003 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
2007 # define EXTEND_BUFFER() \
2009 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
2011 if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \
2013 bufp->allocated <<= 1; \
2014 if (bufp->allocated > MAX_BUF_SIZE) \
2015 bufp->allocated = MAX_BUF_SIZE; \
2016 /* How many characters the new buffer can have? */ \
2017 wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \
2018 if (wchar_count == 0) wchar_count = 1; \
2019 /* Truncate the buffer to CHAR_TYPE align. */ \
2020 bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \
2021 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \
2022 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
2023 if (COMPILED_BUFFER_VAR == NULL) \
2024 return REG_ESPACE; \
2025 /* If the buffer moved, move all the pointers into it. */ \
2026 if (old_buffer != COMPILED_BUFFER_VAR) \
2028 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2029 MOVE_BUFFER_POINTER (b); \
2030 MOVE_BUFFER_POINTER (begalt); \
2031 if (fixup_alt_jump) \
2032 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2034 MOVE_BUFFER_POINTER (laststart); \
2035 if (pending_exact) \
2036 MOVE_BUFFER_POINTER (pending_exact); \
2038 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2041 # define EXTEND_BUFFER() \
2043 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
2044 if (bufp->allocated == MAX_BUF_SIZE) \
2046 bufp->allocated <<= 1; \
2047 if (bufp->allocated > MAX_BUF_SIZE) \
2048 bufp->allocated = MAX_BUF_SIZE; \
2049 bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \
2051 if (COMPILED_BUFFER_VAR == NULL) \
2052 return REG_ESPACE; \
2053 /* If the buffer moved, move all the pointers into it. */ \
2054 if (old_buffer != COMPILED_BUFFER_VAR) \
2056 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2057 MOVE_BUFFER_POINTER (b); \
2058 MOVE_BUFFER_POINTER (begalt); \
2059 if (fixup_alt_jump) \
2060 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2062 MOVE_BUFFER_POINTER (laststart); \
2063 if (pending_exact) \
2064 MOVE_BUFFER_POINTER (pending_exact); \
2066 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2068 #endif /* MBS_SUPPORT */
2070 /* Since we have one byte reserved for the register number argument to
2071 {start,stop}_memory, the maximum number of groups we can report
2072 things about is what fits in that byte. */
2073 #define MAX_REGNUM 255
2075 /* But patterns can have more than `MAX_REGNUM' registers. We just
2076 ignore the excess. */
2077 typedef unsigned regnum_t;
2080 /* Macros for the compile stack. */
2082 /* Since offsets can go either forwards or backwards, this type needs to
2083 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
2084 /* int may not be enough when sizeof(int) == 2: a 16-bit int cannot
represent values as large as MAX_BUF_SIZE - 1, so long is used. */
2085 typedef long pattern_offset_t;
2089 pattern_offset_t begalt_offset;
2090 pattern_offset_t fixup_alt_jump;
2091 pattern_offset_t inner_group_offset;
2092 pattern_offset_t laststart_offset;
2094 } compile_stack_elt_t;
2099 compile_stack_elt_t *stack;
2101 unsigned avail; /* Offset of next open position. */
2102 } compile_stack_type;
2105 #define INIT_COMPILE_STACK_SIZE 32
/* Predicates on the variable `compile_stack': empty when no element has
   been pushed (avail is 0), full when avail has reached the allocated
   size.  */
2107 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
2108 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
2110 /* The next available element. */
2111 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
2114 /* Set the bit for character C in a list.
Expects `b' to point at the bitmap being filled in. C is converted to
unsigned char; the byte index is that value divided by BYTEWIDTH and the
bit within the byte is the remainder. C is evaluated twice, so it must
have no side effects. Both uses of C are parenthesized so that an
expression argument is converted as a whole rather than having the cast
bind to its first operand only. */
2115 #define SET_LIST_BIT(c) \
2116 (b[((unsigned char) (c)) / BYTEWIDTH] \
2117 |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
2120 /* Get the next unsigned number in the uncompiled pattern. */
2121 #define GET_UNSIGNED_NUMBER(num) \
2125 while ('0' <= c && c <= '9') \
2129 num = num * 10 + c - '0'; \
2137 #if defined _LIBC || WIDE_CHAR_SUPPORT
2138 /* The GNU C library provides support for user-defined character classes
2139 and the functions from ISO C amendment 1. */
2140 # ifdef CHARCLASS_NAME_MAX
2141 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2143 /* This shouldn't happen but some implementations might still have this
2144 problem. Use a reasonable default value. */
2145 # define CHAR_CLASS_MAX_LENGTH 256
2149 # define IS_CHAR_CLASS(string) __wctype (string)
2151 # define IS_CHAR_CLASS(string) wctype (string)
2154 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
/* Nonzero if STRING is exactly one of the twelve character class names
   listed below, compared with STREQ.  STRING is evaluated once per
   comparison, so it must have no side effects.  */
2156 # define IS_CHAR_CLASS(string) \
2157 (STREQ (string, "alpha") || STREQ (string, "upper") \
2158 || STREQ (string, "lower") || STREQ (string, "digit") \
2159 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
2160 || STREQ (string, "space") || STREQ (string, "print") \
2161 || STREQ (string, "punct") || STREQ (string, "graph") \
2162 || STREQ (string, "cntrl") || STREQ (string, "blank"))
2165 #ifndef MATCH_MAY_ALLOCATE
2167 /* If we cannot allocate large objects within re_match_2_internal,
2168 we make the fail stack and register vectors global.
2169 The fail stack, we grow to the maximum size when a regexp
2171 The register vectors, we adjust in size each time we
2172 compile a regexp, according to the number of registers it needs. */
2174 static fail_stack_type fail_stack;
2176 /* Size with which the following vectors are currently allocated.
2177 That is so we can make them bigger as needed,
2178 but never make them smaller. */
2179 static int regs_allocated_size;
2181 static const char ** regstart, ** regend;
2182 static const char ** old_regstart, ** old_regend;
2183 static const char **best_regstart, **best_regend;
2184 static register_info_type *reg_info;
2185 static const char **reg_dummy;
2186 static register_info_type *reg_info_dummy;
2188 /* Make the register vectors big enough for NUM_REGS registers,
2189 but don't make them smaller. */
2192 regex_grow_registers (num_regs)
2195 if (num_regs > regs_allocated_size)
2197 RETALLOC_IF (regstart, num_regs, const char *);
2198 RETALLOC_IF (regend, num_regs, const char *);
2199 RETALLOC_IF (old_regstart, num_regs, const char *);
2200 RETALLOC_IF (old_regend, num_regs, const char *);
2201 RETALLOC_IF (best_regstart, num_regs, const char *);
2202 RETALLOC_IF (best_regend, num_regs, const char *);
2203 RETALLOC_IF (reg_info, num_regs, register_info_type);
2204 RETALLOC_IF (reg_dummy, num_regs, const char *);
2205 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
2207 regs_allocated_size = num_regs;
2211 #endif /* not MATCH_MAY_ALLOCATE */
2213 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2217 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2218 Returns one of error codes defined in `regex.h', or zero for success.
2220 Assumes the `allocated' (and perhaps `buffer') and `translate'
2221 fields are set in BUFP on entry.
2223 If it succeeds, results are put in BUFP (if it returns an error, the
2224 contents of BUFP are undefined):
2225 `buffer' is the compiled pattern;
2226 `syntax' is set to SYNTAX;
2227 `used' is set to the length of the compiled pattern;
2228 `fastmap_accurate' is zero;
2229 `re_nsub' is the number of subexpressions in PATTERN;
2230 `not_bol' and `not_eol' are zero;
2232 The `fastmap' and `newline_anchor' fields are neither
2233 examined nor set. */
2235 /* Return, freeing storage we allocated.
Both variants expand to a `return' of a comma expression, so the frees
run first and VALUE is the value returned. One variant releases only
compile_stack.stack; the other also releases pattern, mbs_offset and
is_binary (presumably the MBS_SUPPORT build -- confirm against the
enclosing conditional). */
2237 # define FREE_STACK_RETURN(value) \
2238 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
2240 # define FREE_STACK_RETURN(value) \
2241 return (free (compile_stack.stack), value)
2242 #endif /* MBS_SUPPORT */
2244 static reg_errcode_t
2246 regex_compile (cpattern, csize, syntax, bufp)
2247 const char *cpattern;
2250 regex_compile (pattern, size, syntax, bufp)
2251 const char *pattern;
2253 #endif /* MBS_SUPPORT */
2254 reg_syntax_t syntax;
2255 struct re_pattern_buffer *bufp;
2257 /* We fetch characters from PATTERN here. Even though PATTERN is
2258 `char *' (i.e., signed), we declare these variables as unsigned, so
2259 they can be reliably used as array indices. */
2260 register US_CHAR_TYPE c, c1;
2263 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2264 CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR;
2266 /* Offset buffer for optimization. See convert_mbs_to_wcs. */
2267 int *mbs_offset = NULL;
2268 /* It holds whether each wchar_t is binary data or not. */
2269 int *is_binary = NULL;
2270 /* A flag whether exactn is handling binary data or not. */
2271 int is_exactn_bin = FALSE;
2272 #endif /* MBS_SUPPORT */
2274 /* A random temporary spot in PATTERN. */
2275 const CHAR_TYPE *p1;
2277 /* Points to the end of the buffer, where we should append. */
2278 register US_CHAR_TYPE *b;
2280 /* Keeps track of unclosed groups. */
2281 compile_stack_type compile_stack;
2283 /* Points to the current (ending) position in the pattern. */
2286 const CHAR_TYPE *pend;
2288 const CHAR_TYPE *p = pattern;
2289 const CHAR_TYPE *pend = pattern + size;
2290 #endif /* MBS_SUPPORT */
2292 /* How to translate the characters in the pattern. */
2293 RE_TRANSLATE_TYPE translate = bufp->translate;
2295 /* Address of the count-byte of the most recently inserted `exactn'
2296 command. This makes it possible to tell if a new exact-match
2297 character can be added to that command or if the character requires
2298 a new `exactn' command. */
2299 US_CHAR_TYPE *pending_exact = 0;
2301 /* Address of start of the most recently finished expression.
2302 This tells, e.g., postfix * where to find the start of its
2303 operand. Reset at the beginning of groups and alternatives. */
2304 US_CHAR_TYPE *laststart = 0;
2306 /* Address of beginning of regexp, or inside of last group. */
2307 US_CHAR_TYPE *begalt;
2309 /* Place in the uncompiled pattern (i.e., the {) to
2310 which to go back if the interval is invalid. */
2312 const US_CHAR_TYPE *beg_interval;
2314 const char *beg_interval;
2315 #endif /* MBS_SUPPORT */
2317 /* Address of the place where a forward jump should go to the end of
2318 the containing expression. Each alternative of an `or' -- except the
2319 last -- ends with a forward jump of this sort. */
2320 US_CHAR_TYPE *fixup_alt_jump = 0;
2322 /* Counts open-groups as they are encountered. Remembered for the
2323 matching close-group on the compile stack, so the same register
2324 number is put in the stop_memory as the start_memory. */
2325 regnum_t regnum = 0;
2328 /* Initialize the wchar_t PATTERN and offset_buffer. */
2329 p = pend = pattern = TALLOC(csize, CHAR_TYPE);
2330 mbs_offset = TALLOC(csize + 1, int);
2331 is_binary = TALLOC(csize + 1, int);
2332 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2334 if (pattern) free(pattern);
2335 if (mbs_offset) free(mbs_offset);
2336 if (is_binary) free(is_binary);
2339 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2343 if (pattern) free(pattern);
2344 if (mbs_offset) free(mbs_offset);
2345 if (is_binary) free(is_binary);
2351 DEBUG_PRINT1 ("\nCompiling pattern: ");
2354 unsigned debug_count;
2356 for (debug_count = 0; debug_count < size; debug_count++)
2357 PUT_CHAR (pattern[debug_count]);
2362 /* Initialize the compile stack. */
2363 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2364 if (compile_stack.stack == NULL)
2367 if (pattern) free(pattern);
2368 if (mbs_offset) free(mbs_offset);
2369 if (is_binary) free(is_binary);
2374 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2375 compile_stack.avail = 0;
2377 /* Initialize the pattern buffer. */
2378 bufp->syntax = syntax;
2379 bufp->fastmap_accurate = 0;
2380 bufp->not_bol = bufp->not_eol = 0;
2382 /* Set `used' to zero, so that if we return an error, the pattern
2383 printer (for debugging) will think there's no pattern. We reset it
2387 /* Always count groups, whether or not bufp->no_sub is set. */
2390 #if !defined emacs && !defined SYNTAX_TABLE
2391 /* Initialize the syntax table. */
2392 init_syntax_once ();
2395 if (bufp->allocated == 0)
2398 { /* If zero allocated, but buffer is non-null, try to realloc
2399 enough space. This loses if buffer's address is bogus, but
2400 that is the user's responsibility. */
2402 /* Free bufp->buffer and allocate an array for wchar_t pattern
2405 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE),
2408 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE);
2409 #endif /* MBS_SUPPORT */
2412 { /* Caller did not allocate a buffer. Do it for them. */
2413 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE),
2417 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2419 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2420 #endif /* MBS_SUPPORT */
2421 bufp->allocated = INIT_BUF_SIZE;
2425 COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer;
2428 begalt = b = COMPILED_BUFFER_VAR;
2430 /* Loop through the uncompiled pattern until we're at the end. */
2439 if ( /* If at start of pattern, it's an operator. */
2441 /* If context independent, it's an operator. */
2442 || syntax & RE_CONTEXT_INDEP_ANCHORS
2443 /* Otherwise, depends on what's come before. */
2444 || at_begline_loc_p (pattern, p, syntax))
2454 if ( /* If at end of pattern, it's an operator. */
2456 /* If context independent, it's an operator. */
2457 || syntax & RE_CONTEXT_INDEP_ANCHORS
2458 /* Otherwise, depends on what's next. */
2459 || at_endline_loc_p (p, pend, syntax))
2469 if ((syntax & RE_BK_PLUS_QM)
2470 || (syntax & RE_LIMITED_OPS))
2474 /* If there is no previous pattern... */
2477 if (syntax & RE_CONTEXT_INVALID_OPS)
2478 FREE_STACK_RETURN (REG_BADRPT);
2479 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2484 /* Are we optimizing this jump? */
2485 boolean keep_string_p = false;
2487 /* 1 means zero (many) matches is allowed. */
2488 char zero_times_ok = 0, many_times_ok = 0;
2490 /* If there is a sequence of repetition chars, collapse it
2491 down to just one (the right one). We can't combine
2492 interval operators with these because of, e.g., `a{2}*',
2493 which should only match an even number of `a's. */
2497 zero_times_ok |= c != '+';
2498 many_times_ok |= c != '?';
2506 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2509 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2511 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2514 if (!(c1 == '+' || c1 == '?'))
2529 /* If we get here, we found another repeat character. */
2532 /* Star, etc. applied to an empty pattern is equivalent
2533 to an empty pattern. */
2537 /* Now we know whether or not zero matches is allowed
2538 and also whether or not two or more matches is allowed. */
2540 { /* More than one repetition is allowed, so put in at the
2541 end a backward relative jump from `b' to before the next
2542 jump we're going to put in below (which jumps from
2543 laststart to after this jump).
2545 But if we are at the `*' in the exact sequence `.*\n',
2546 insert an unconditional jump backwards to the .,
2547 instead of the beginning of the loop. This way we only
2548 push a failure point once, instead of every time
2549 through the loop. */
2550 assert (p - 1 > pattern);
2552 /* Allocate the space for the jump. */
2553 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2555 /* We know we are not at the first character of the pattern,
2556 because laststart was nonzero. And we've already
2557 incremented `p', by the way, to be the character after
2558 the `*'. Do we have to do something analogous here
2559 for null bytes, because of RE_DOT_NOT_NULL? */
2560 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2562 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2563 && !(syntax & RE_DOT_NEWLINE))
2564 { /* We have .*\n. */
2565 STORE_JUMP (jump, b, laststart);
2566 keep_string_p = true;
2569 /* Anything else. */
2570 STORE_JUMP (maybe_pop_jump, b, laststart -
2571 (1 + OFFSET_ADDRESS_SIZE));
2573 /* We've added more stuff to the buffer. */
2574 b += 1 + OFFSET_ADDRESS_SIZE;
2577 /* On failure, jump from laststart to b + 3, which will be the
2578 end of the buffer after this jump is inserted. */
2579 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2581 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2582 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2584 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2586 b += 1 + OFFSET_ADDRESS_SIZE;
2590 /* At least one repetition is required, so insert a
2591 `dummy_failure_jump' before the initial
2592 `on_failure_jump' instruction of the loop. This
2593 effects a skip over that instruction the first time
2594 we hit that loop. */
2595 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2596 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2597 2 + 2 * OFFSET_ADDRESS_SIZE);
2598 b += 1 + OFFSET_ADDRESS_SIZE;
2612 boolean had_char_class = false;
2614 CHAR_TYPE range_start = 0xffffffff;
2616 unsigned int range_start = 0xffffffff;
2618 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2621 /* We assume a charset(_not) structure as a wchar_t array.
2622 charset[0] = (re_opcode_t) charset(_not)
2623 charset[1] = l (= length of char_classes)
2624 charset[2] = m (= length of collating_symbols)
2625 charset[3] = n (= length of equivalence_classes)
2626 charset[4] = o (= length of char_ranges)
2627 charset[5] = p (= length of chars)
2629 charset[6] = char_class (wctype_t)
2631 charset[l+5] = char_class (wctype_t)
2633 charset[l+6] = collating_symbol (wchar_t)
2635 charset[l+m+5] = collating_symbol (wchar_t)
2636 ifdef _LIBC we use the index if
2637 _NL_COLLATE_SYMB_EXTRAMB instead of
2640 charset[l+m+6] = equivalence_classes (wchar_t)
2642 charset[l+m+n+5] = equivalence_classes (wchar_t)
2643 ifdef _LIBC we use the index in
2644 _NL_COLLATE_WEIGHT instead of
2647 charset[l+m+n+6] = range_start
2648 charset[l+m+n+7] = range_end
2650 charset[l+m+n+2o+4] = range_start
2651 charset[l+m+n+2o+5] = range_end
2652 ifdef _LIBC we use the value looked up
2653 in _NL_COLLATE_COLLSEQ instead of
2656 charset[l+m+n+2o+6] = char
2658 charset[l+m+n+2o+p+5] = char
2662 /* We need at least 6 spaces: the opcode, the length of
2663 char_classes, the length of collating_symbols, the length of
2664 equivalence_classes, the length of char_ranges, the length of
2666 GET_BUFFER_SPACE (6);
2668 /* Save b as laststart. And we use laststart as the pointer
2669 to the first element of the charset here.
2670 In other words, laststart[i] indicates charset[i]. */
2673 /* We test `*p == '^' twice, instead of using an if
2674 statement, so we only need one BUF_PUSH. */
2675 BUF_PUSH (*p == '^' ? charset_not : charset);
2679 /* Push the length of char_classes, the length of
2680 collating_symbols, the length of equivalence_classes, the
2681 length of char_ranges and the length of chars. */
2682 BUF_PUSH_3 (0, 0, 0);
2685 /* Remember the first position in the bracket expression. */
2688 /* charset_not matches newline according to a syntax bit. */
2689 if ((re_opcode_t) b[-6] == charset_not
2690 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2693 laststart[5]++; /* Update the length of characters */
2696 /* Read in characters and ranges, setting map bits. */
2699 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2703 /* \ might escape characters inside [...] and [^...]. */
2704 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2706 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2710 laststart[5]++; /* Update the length of chars */
2715 /* Could be the end of the bracket expression. If it's
2716 not (i.e., when the bracket expression is `[]' so
2717 far), the ']' character bit gets set way below. */
2718 if (c == ']' && p != p1 + 1)
2721 /* Look ahead to see if it's a range when the last thing
2722 was a character class. */
2723 if (had_char_class && c == '-' && *p != ']')
2724 FREE_STACK_RETURN (REG_ERANGE);
2726 /* Look ahead to see if it's a range when the last thing
2727 was a character: if this is a hyphen not at the
2728 beginning or the end of a list, then it's the range
2731 && !(p - 2 >= pattern && p[-2] == '[')
2732 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2736 /* Allocate the space for range_start and range_end. */
2737 GET_BUFFER_SPACE (2);
2738 /* Update the pointer to indicate end of buffer. */
2740 ret = compile_range (range_start, &p, pend, translate,
2741 syntax, b, laststart);
2742 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2743 range_start = 0xffffffff;
2745 else if (p[0] == '-' && p[1] != ']')
2746 { /* This handles ranges made up of characters only. */
2749 /* Move past the `-'. */
2751 /* Allocate the space for range_start and range_end. */
2752 GET_BUFFER_SPACE (2);
2753 /* Update the pointer to indicate end of buffer. */
2755 ret = compile_range (c, &p, pend, translate, syntax, b,
2757 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2758 range_start = 0xffffffff;
2761 /* See if we're at the beginning of a possible character
2763 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2764 { /* Leave room for the null. */
2765 char str[CHAR_CLASS_MAX_LENGTH + 1];
2770 /* If pattern is `[[:'. */
2771 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2776 if ((c == ':' && *p == ']') || p == pend)
2778 if (c1 < CHAR_CLASS_MAX_LENGTH)
2781 /* This is in any case an invalid class name. */
2786 /* If isn't a word bracketed by `[:' and `:]':
2787 undo the ending character, the letters, and leave
2788 the leading `:' and `[' (but store them as character). */
2789 if (c == ':' && *p == ']')
2792 /* Query the character class as wctype_t. */
2793 wt = IS_CHAR_CLASS (str);
2795 FREE_STACK_RETURN (REG_ECTYPE);
2797 /* Throw away the ] at the end of the character
2801 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2803 /* Allocate the space for character class. */
2804 GET_BUFFER_SPACE(1);
2805 /* Update the pointer to indicate end of buffer. */
2807 /* Move data which follow character classes
2808 not to violate the data. */
2809 insert_space(1, laststart+6, b-1);
2810 /* Store the character class. */
2811 laststart[6] = (CHAR_TYPE) wt;
2812 laststart[1]++; /* Update length of char_classes */
2814 had_char_class = true;
2823 laststart[5] += 2; /* Update the length of characters */
2825 had_char_class = false;
2828 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2831 CHAR_TYPE str[128]; /* Should be large enough. */
2832 CHAR_TYPE delim = *p; /* '=' or '.' */
2835 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2840 /* If pattern is `[[=' or '[[.'. */
2841 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2846 if ((c == delim && *p == ']') || p == pend)
2848 if (c1 < sizeof (str) - 1)
2851 /* This is in any case an invalid class name. */
2856 if (c == delim && *p == ']' && str[0] != '\0')
2858 unsigned int i, offset;
2859 /* If we have no collation data we use the default
2860 collation in which each character is in a class
2861 by itself. It also means that ASCII is the
2862 character set and therefore we cannot have character
2863 with more than one byte in the multibyte
2866 /* If not defined _LIBC, we push the name and
2867 `\0' for the sake of matching performance. */
2868 int datasize = c1 + 1;
2876 FREE_STACK_RETURN (REG_ECOLLATE);
2881 const int32_t *table;
2882 const int32_t *weights;
2883 const int32_t *extra;
2884 const int32_t *indirect;
2887 /* This #include defines a local function! */
2888 # include <locale/weightwc.h>
2892 /* We push the index for equivalence class. */
2895 table = (const int32_t *)
2896 _NL_CURRENT (LC_COLLATE,
2897 _NL_COLLATE_TABLEWC);
2898 weights = (const int32_t *)
2899 _NL_CURRENT (LC_COLLATE,
2900 _NL_COLLATE_WEIGHTWC);
2901 extra = (const int32_t *)
2902 _NL_CURRENT (LC_COLLATE,
2903 _NL_COLLATE_EXTRAWC);
2904 indirect = (const int32_t *)
2905 _NL_CURRENT (LC_COLLATE,
2906 _NL_COLLATE_INDIRECTWC);
2908 idx = findidx ((const wint_t**)&cp);
2909 if (idx == 0 || cp < (wint_t*) str + c1)
2910 /* This is no valid character. */
2911 FREE_STACK_RETURN (REG_ECOLLATE);
2913 str[0] = (wchar_t)idx;
2915 else /* delim == '.' */
2917 /* We push collation sequence value
2918 for collating symbol. */
2920 const int32_t *symb_table;
2921 const unsigned char *extra;
2928 /* We have to convert the name to a single-byte
2929 string. This is possible since the names
2930 consist of ASCII characters and the internal
2931 representation is UCS4. */
2932 for (i = 0; i < c1; ++i)
2933 char_str[i] = str[i];
2936 _NL_CURRENT_WORD (LC_COLLATE,
2937 _NL_COLLATE_SYMB_HASH_SIZEMB);
2938 symb_table = (const int32_t *)
2939 _NL_CURRENT (LC_COLLATE,
2940 _NL_COLLATE_SYMB_TABLEMB);
2941 extra = (const unsigned char *)
2942 _NL_CURRENT (LC_COLLATE,
2943 _NL_COLLATE_SYMB_EXTRAMB);
2945 /* Locate the character in the hashing table. */
2946 hash = elem_hash (char_str, c1);
2949 elem = hash % table_size;
2950 second = hash % (table_size - 2);
2951 while (symb_table[2 * elem] != 0)
2953 /* First compare the hashing value. */
2954 if (symb_table[2 * elem] == hash
2955 && c1 == extra[symb_table[2 * elem + 1]]
2957 &extra[symb_table[2 * elem + 1]
2960 /* Yep, this is the entry. */
2961 idx = symb_table[2 * elem + 1];
2962 idx += 1 + extra[idx];
2970 if (symb_table[2 * elem] != 0)
2972 /* Compute the index of the byte sequence
2974 idx += 1 + extra[idx];
2975 /* Adjust for the alignment. */
2976 idx = (idx + 3) & ~4;
2978 str[0] = (wchar_t) &extra[idx + 4];
2980 else if (symb_table[2 * elem] == 0 && c1 == 1)
2982 /* No valid character. Match it as a
2983 single byte character. */
2984 had_char_class = false;
2986 /* Update the length of characters */
2988 range_start = str[0];
2990 /* Throw away the ] at the end of the
2991 collating symbol. */
2993 /* exit from the switch block. */
2997 FREE_STACK_RETURN (REG_ECOLLATE);
3002 /* Throw away the ] at the end of the equivalence
3003 class (or collating symbol). */
3006 /* Allocate the space for the equivalence class
3007 (or collating symbol) (and '\0' if needed). */
3008 GET_BUFFER_SPACE(datasize);
3009 /* Update the pointer to indicate end of buffer. */
3013 { /* equivalence class */
3014 /* Calculate the offset of char_ranges,
3015 which is next to equivalence_classes. */
3016 offset = laststart[1] + laststart[2]
3019 insert_space(datasize, laststart + offset, b - 1);
3021 /* Write the equivalence_class and \0. */
3022 for (i = 0 ; i < datasize ; i++)
3023 laststart[offset + i] = str[i];
3025 /* Update the length of equivalence_classes. */
3026 laststart[3] += datasize;
3027 had_char_class = true;
3029 else /* delim == '.' */
3030 { /* collating symbol */
3031 /* Calculate the offset of the equivalence_classes,
3032 which is next to collating_symbols. */
3033 offset = laststart[1] + laststart[2] + 6;
3034 /* Insert space and write the collating_symbol
3036 insert_space(datasize, laststart + offset, b-1);
3037 for (i = 0 ; i < datasize ; i++)
3038 laststart[offset + i] = str[i];
3040 /* In re_match_2_internal if range_start < -1, we
3041 assume -range_start is the offset of the
3042 collating symbol which is specified as
3043 the character of the range start. So we assign
3044 -(laststart[1] + laststart[2] + 6) to
3046 range_start = -(laststart[1] + laststart[2] + 6);
3047 /* Update the length of collating_symbol. */
3048 laststart[2] += datasize;
3049 had_char_class = false;
3059 laststart[5] += 2; /* Update the length of characters */
3060 range_start = delim;
3061 had_char_class = false;
3066 had_char_class = false;
3068 laststart[5]++; /* Update the length of characters */
3073 #else /* not MBS_SUPPORT */
3074 /* Ensure that we have enough space to push a charset: the
3075 opcode, the length count, and the bitset; 34 bytes in all. */
3076 GET_BUFFER_SPACE (34);
3080 /* We test `*p == '^' twice, instead of using an if
3081 statement, so we only need one BUF_PUSH. */
3082 BUF_PUSH (*p == '^' ? charset_not : charset);
3086 /* Remember the first position in the bracket expression. */
3089 /* Push the number of bytes in the bitmap. */
3090 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3092 /* Clear the whole map. */
3093 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3095 /* charset_not matches newline according to a syntax bit. */
3096 if ((re_opcode_t) b[-2] == charset_not
3097 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3098 SET_LIST_BIT ('\n');
3100 /* Read in characters and ranges, setting map bits. */
3103 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3107 /* \ might escape characters inside [...] and [^...]. */
3108 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3110 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3118 /* Could be the end of the bracket expression. If it's
3119 not (i.e., when the bracket expression is `[]' so
3120 far), the ']' character bit gets set way below. */
3121 if (c == ']' && p != p1 + 1)
3124 /* Look ahead to see if it's a range when the last thing
3125 was a character class. */
3126 if (had_char_class && c == '-' && *p != ']')
3127 FREE_STACK_RETURN (REG_ERANGE);
3129 /* Look ahead to see if it's a range when the last thing
3130 was a character: if this is a hyphen not at the
3131 beginning or the end of a list, then it's the range
3134 && !(p - 2 >= pattern && p[-2] == '[')
3135 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3139 = compile_range (range_start, &p, pend, translate,
3141 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3142 range_start = 0xffffffff;
3145 else if (p[0] == '-' && p[1] != ']')
3146 { /* This handles ranges made up of characters only. */
3149 /* Move past the `-'. */
3152 ret = compile_range (c, &p, pend, translate, syntax, b);
3153 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3154 range_start = 0xffffffff;
3157 /* See if we're at the beginning of a possible character
3160 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3161 { /* Leave room for the null. */
3162 char str[CHAR_CLASS_MAX_LENGTH + 1];
3167 /* If pattern is `[[:'. */
3168 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3173 if ((c == ':' && *p == ']') || p == pend)
3175 if (c1 < CHAR_CLASS_MAX_LENGTH)
3178 /* This is in any case an invalid class name. */
3183 /* If isn't a word bracketed by `[:' and `:]':
3184 undo the ending character, the letters, and leave
3185 the leading `:' and `[' (but set bits for them). */
3186 if (c == ':' && *p == ']')
3188 # if defined _LIBC || WIDE_CHAR_SUPPORT
3189 boolean is_lower = STREQ (str, "lower");
3190 boolean is_upper = STREQ (str, "upper");
3194 wt = IS_CHAR_CLASS (str);
3196 FREE_STACK_RETURN (REG_ECTYPE);
3198 /* Throw away the ] at the end of the character
3202 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3204 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3207 if (__iswctype (__btowc (ch), wt))
3210 if (iswctype (btowc (ch), wt))
3214 if (translate && (is_upper || is_lower)
3215 && (ISUPPER (ch) || ISLOWER (ch)))
3219 had_char_class = true;
3222 boolean is_alnum = STREQ (str, "alnum");
3223 boolean is_alpha = STREQ (str, "alpha");
3224 boolean is_blank = STREQ (str, "blank");
3225 boolean is_cntrl = STREQ (str, "cntrl");
3226 boolean is_digit = STREQ (str, "digit");
3227 boolean is_graph = STREQ (str, "graph");
3228 boolean is_lower = STREQ (str, "lower");
3229 boolean is_print = STREQ (str, "print");
3230 boolean is_punct = STREQ (str, "punct");
3231 boolean is_space = STREQ (str, "space");
3232 boolean is_upper = STREQ (str, "upper");
3233 boolean is_xdigit = STREQ (str, "xdigit");
3235 if (!IS_CHAR_CLASS (str))
3236 FREE_STACK_RETURN (REG_ECTYPE);
3238 /* Throw away the ] at the end of the character
3242 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3244 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3246 /* This was split into 3 if's to
3247 avoid an arbitrary limit in some compiler. */
3248 if ( (is_alnum && ISALNUM (ch))
3249 || (is_alpha && ISALPHA (ch))
3250 || (is_blank && ISBLANK (ch))
3251 || (is_cntrl && ISCNTRL (ch)))
3253 if ( (is_digit && ISDIGIT (ch))
3254 || (is_graph && ISGRAPH (ch))
3255 || (is_lower && ISLOWER (ch))
3256 || (is_print && ISPRINT (ch)))
3258 if ( (is_punct && ISPUNCT (ch))
3259 || (is_space && ISSPACE (ch))
3260 || (is_upper && ISUPPER (ch))
3261 || (is_xdigit && ISXDIGIT (ch)))
3263 if ( translate && (is_upper || is_lower)
3264 && (ISUPPER (ch) || ISLOWER (ch)))
3267 had_char_class = true;
3268 # endif /* libc || wctype.h */
3278 had_char_class = false;
3281 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3283 unsigned char str[MB_LEN_MAX + 1];
3286 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3292 /* If pattern is `[[='. */
3293 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3298 if ((c == '=' && *p == ']') || p == pend)
3300 if (c1 < MB_LEN_MAX)
3303 /* This is in any case an invalid class name. */
3308 if (c == '=' && *p == ']' && str[0] != '\0')
3310 /* If we have no collation data we use the default
3311 collation in which each character is in a class
3312 by itself. It also means that ASCII is the
3313 character set and therefore we cannot have character
3314 with more than one byte in the multibyte
3321 FREE_STACK_RETURN (REG_ECOLLATE);
3323 /* Throw away the ] at the end of the equivalence
3327 /* Set the bit for the character. */
3328 SET_LIST_BIT (str[0]);
3333 /* Try to match the byte sequence in `str' against
3334 those known to the collate implementation.
3335 First find out whether the bytes in `str' are
3336 actually from exactly one character. */
3337 const int32_t *table;
3338 const unsigned char *weights;
3339 const unsigned char *extra;
3340 const int32_t *indirect;
3342 const unsigned char *cp = str;
3345 /* This #include defines a local function! */
3346 # include <locale/weight.h>
3348 table = (const int32_t *)
3349 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3350 weights = (const unsigned char *)
3351 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3352 extra = (const unsigned char *)
3353 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3354 indirect = (const int32_t *)
3355 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3357 idx = findidx (&cp);
3358 if (idx == 0 || cp < str + c1)
3359 /* This is no valid character. */
3360 FREE_STACK_RETURN (REG_ECOLLATE);
3362 /* Throw away the ] at the end of the equivalence
3366 /* Now we have to go through the whole table
3367 and find all characters which have the same
3370 XXX Note that this is not entirely correct.
3371 we would have to match multibyte sequences
3372 but this is not possible with the current
3374 for (ch = 1; ch < 256; ++ch)
3375 /* XXX This test would have to be changed if we
3376 would allow matching multibyte sequences. */
3379 int32_t idx2 = table[ch];
3380 size_t len = weights[idx2];
3382 /* Test whether the lengths match. */
3383 if (weights[idx] == len)
3385 /* They do. Now compare the bytes of
3390 && (weights[idx + 1 + cnt]
3391 == weights[idx2 + 1 + cnt]))
3395 /* They match. Mark the character as
3402 had_char_class = true;
3412 had_char_class = false;
3415 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3417 unsigned char str[128]; /* Should be large enough. */
3420 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3426 /* If pattern is `[[.'. */
3427 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3432 if ((c == '.' && *p == ']') || p == pend)
3434 if (c1 < sizeof (str))
3437 /* This is in any case an invalid class name. */
3442 if (c == '.' && *p == ']' && str[0] != '\0')
3444 /* If we have no collation data we use the default
3445 collation in which each character is the name
3446 for its own class which contains only the one
3447 character. It also means that ASCII is the
3448 character set and therefore we cannot have character
3449 with more than one byte in the multibyte
3456 FREE_STACK_RETURN (REG_ECOLLATE);
3458 /* Throw away the ] at the end of the equivalence
3462 /* Set the bit for the character. */
3463 SET_LIST_BIT (str[0]);
3464 range_start = ((const unsigned char *) str)[0];
3469 /* Try to match the byte sequence in `str' against
3470 those known to the collate implementation.
3471 First find out whether the bytes in `str' are
3472 actually from exactly one character. */
3474 const int32_t *symb_table;
3475 const unsigned char *extra;
3482 _NL_CURRENT_WORD (LC_COLLATE,
3483 _NL_COLLATE_SYMB_HASH_SIZEMB);
3484 symb_table = (const int32_t *)
3485 _NL_CURRENT (LC_COLLATE,
3486 _NL_COLLATE_SYMB_TABLEMB);
3487 extra = (const unsigned char *)
3488 _NL_CURRENT (LC_COLLATE,
3489 _NL_COLLATE_SYMB_EXTRAMB);
3491 /* Locate the character in the hashing table. */
3492 hash = elem_hash (str, c1);
3495 elem = hash % table_size;
3496 second = hash % (table_size - 2);
3497 while (symb_table[2 * elem] != 0)
3499 /* First compare the hashing value. */
3500 if (symb_table[2 * elem] == hash
3501 && c1 == extra[symb_table[2 * elem + 1]]
3503 &extra[symb_table[2 * elem + 1]
3507 /* Yep, this is the entry. */
3508 idx = symb_table[2 * elem + 1];
3509 idx += 1 + extra[idx];
3517 if (symb_table[2 * elem] == 0)
3518 /* This is no valid character. */
3519 FREE_STACK_RETURN (REG_ECOLLATE);
3521 /* Throw away the ] at the end of the equivalence
3525 /* Now add the multibyte character(s) we found
3528 XXX Note that this is not entirely correct.
3529 we would have to match multibyte sequences
3530 but this is not possible with the current
3531 implementation. Also, we have to match
3532 collating symbols, which expand to more than
3533 one file, as a whole and not allow the
3534 individual bytes. */
3537 range_start = extra[idx];
3540 SET_LIST_BIT (extra[idx]);
3545 had_char_class = false;
3555 had_char_class = false;
3560 had_char_class = false;
3566 /* Discard any (non)matching list bytes that are all 0 at the
3567 end of the map. Decrease the map-length byte too. */
3568 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3571 #endif /* MBS_SUPPORT */
3577 if (syntax & RE_NO_BK_PARENS)
3584 if (syntax & RE_NO_BK_PARENS)
3591 if (syntax & RE_NEWLINE_ALT)
3598 if (syntax & RE_NO_BK_VBAR)
3605 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3606 goto handle_interval;
3612 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3614 /* Do not translate the character after the \, so that we can
3615 distinguish, e.g., \B from \b, even if we normally would
3616 translate, e.g., B to b. */
3622 if (syntax & RE_NO_BK_PARENS)
3623 goto normal_backslash;
3629 if (COMPILE_STACK_FULL)
3631 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3632 compile_stack_elt_t);
3633 if (compile_stack.stack == NULL) return REG_ESPACE;
3635 compile_stack.size <<= 1;
3638 /* These are the values to restore when we hit end of this
3639 group. They are all relative offsets, so that if the
3640 whole pattern moves because of realloc, they will still
3642 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3643 COMPILE_STACK_TOP.fixup_alt_jump
3644 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3645 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3646 COMPILE_STACK_TOP.regnum = regnum;
3648 /* We will eventually replace the 0 with the number of
3649 groups inner to this one. But do not push a
3650 start_memory for groups beyond the last one we can
3651 represent in the compiled pattern. */
3652 if (regnum <= MAX_REGNUM)
3654 COMPILE_STACK_TOP.inner_group_offset = b
3655 - COMPILED_BUFFER_VAR + 2;
3656 BUF_PUSH_3 (start_memory, regnum, 0);
3659 compile_stack.avail++;
3664 /* If we've reached MAX_REGNUM groups, then this open
3665 won't actually generate any code, so we'll have to
3666 clear pending_exact explicitly. */
3672 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3674 if (COMPILE_STACK_EMPTY)
3676 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3677 goto normal_backslash;
3679 FREE_STACK_RETURN (REG_ERPAREN);
3684 { /* Push a dummy failure point at the end of the
3685 alternative for a possible future
3686 `pop_failure_jump' to pop. See comments at
3687 `push_dummy_failure' in `re_match_2'. */
3688 BUF_PUSH (push_dummy_failure);
3690 /* We allocated space for this jump when we assigned
3691 to `fixup_alt_jump', in the `handle_alt' case below. */
3692 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3695 /* See similar code for backslashed left paren above. */
3696 if (COMPILE_STACK_EMPTY)
3698 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3701 FREE_STACK_RETURN (REG_ERPAREN);
3704 /* Since we just checked for an empty stack above, this
3705 ``can't happen''. */
3706 assert (compile_stack.avail != 0);
3708 /* We don't just want to restore into `regnum', because
3709 later groups should continue to be numbered higher,
3710 as in `(ab)c(de)' -- the second group is #2. */
3711 regnum_t this_group_regnum;
3713 compile_stack.avail--;
3714 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3716 = COMPILE_STACK_TOP.fixup_alt_jump
3717 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3719 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3720 this_group_regnum = COMPILE_STACK_TOP.regnum;
3721 /* If we've reached MAX_REGNUM groups, then this open
3722 won't actually generate any code, so we'll have to
3723 clear pending_exact explicitly. */
3726 /* We're at the end of the group, so now we know how many
3727 groups were inside this one. */
3728 if (this_group_regnum <= MAX_REGNUM)
3730 US_CHAR_TYPE *inner_group_loc
3731 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3733 *inner_group_loc = regnum - this_group_regnum;
3734 BUF_PUSH_3 (stop_memory, this_group_regnum,
3735 regnum - this_group_regnum);
3741 case '|': /* `\|'. */
3742 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3743 goto normal_backslash;
3745 if (syntax & RE_LIMITED_OPS)
3748 /* Insert before the previous alternative a jump which
3749 jumps to this alternative if the former fails. */
3750 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3751 INSERT_JUMP (on_failure_jump, begalt,
3752 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3754 b += 1 + OFFSET_ADDRESS_SIZE;
3756 /* The alternative before this one has a jump after it
3757 which gets executed if it gets matched. Adjust that
3758 jump so it will jump to this alternative's analogous
3759 jump (put in below, which in turn will jump to the next
3760 (if any) alternative's such jump, etc.). The last such
3761 jump jumps to the correct final destination. A picture:
3767 If we are at `b', then fixup_alt_jump right now points to a
3768 three-byte space after `a'. We'll put in the jump, set
3769 fixup_alt_jump to right after `b', and leave behind three
3770 bytes which we'll fill in when we get to after `c'. */
3773 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3775 /* Mark and leave space for a jump after this alternative,
3776 to be filled in later either by next alternative or
3777 when we know we're at the end of a series of alternatives. */
3779 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3780 b += 1 + OFFSET_ADDRESS_SIZE;
3788 /* If \{ is a literal. */
3789 if (!(syntax & RE_INTERVALS)
3790 /* If we're at `\{' and it's not the open-interval
3792 || (syntax & RE_NO_BK_BRACES))
3793 goto normal_backslash;
3797 /* If got here, then the syntax allows intervals. */
3799 /* At least (most) this many matches must be made. */
3800 int lower_bound = -1, upper_bound = -1;
3801 beg_interval = p - 1;
3805 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
3806 goto unfetch_interval;
3808 FREE_STACK_RETURN (REG_EBRACE);
3811 GET_UNSIGNED_NUMBER (lower_bound);
3815 GET_UNSIGNED_NUMBER (upper_bound);
3816 if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
3817 || ((syntax & RE_NO_BK_BRACES) && c != '}'))
3818 FREE_STACK_RETURN (REG_BADBR);
3820 if (upper_bound < 0)
3821 upper_bound = RE_DUP_MAX;
3824 /* Interval such as `{1}' => match exactly once. */
3825 upper_bound = lower_bound;
3827 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
3828 || lower_bound > upper_bound)
3830 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
3831 goto unfetch_interval;
3833 FREE_STACK_RETURN (REG_BADBR);
3836 if (!(syntax & RE_NO_BK_BRACES))
3838 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
3845 if (!(syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
3846 goto unfetch_interval;
3848 FREE_STACK_RETURN (REG_BADBR);
3851 /* We just parsed a valid interval. */
3853 /* If it's invalid to have no preceding re. */
3856 if (syntax & RE_CONTEXT_INVALID_OPS)
3857 FREE_STACK_RETURN (REG_BADRPT);
3858 else if (syntax & RE_CONTEXT_INDEP_OPS)
3861 goto unfetch_interval;
3864 /* If the upper bound is zero, don't want to succeed at
3865 all; jump from `laststart' to `b + 3', which will be
3866 the end of the buffer after we insert the jump. */
3867 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE'
3868 instead of 'b + 3'. */
3869 if (upper_bound == 0)
3871 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3872 INSERT_JUMP (jump, laststart, b + 1
3873 + OFFSET_ADDRESS_SIZE);
3874 b += 1 + OFFSET_ADDRESS_SIZE;
3877 /* Otherwise, we have a nontrivial interval. When
3878 we're all done, the pattern will look like:
3879 set_number_at <jump count> <upper bound>
3880 set_number_at <succeed_n count> <lower bound>
3881 succeed_n <after jump addr> <succeed_n count>
3883 jump_n <succeed_n addr> <jump count>
3884 (The upper bound and `jump_n' are omitted if
3885 `upper_bound' is 1, though.) */
3887 { /* If the upper bound is > 1, we need to insert
3888 more at the end of the loop. */
3889 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3890 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3892 GET_BUFFER_SPACE (nbytes);
3894 /* Initialize lower bound of the `succeed_n', even
3895 though it will be set during matching by its
3896 attendant `set_number_at' (inserted next),
3897 because `re_compile_fastmap' needs to know.
3898 Jump to the `jump_n' we might insert below. */
3899 INSERT_JUMP2 (succeed_n, laststart,
3900 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3901 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3903 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3905 /* Code to initialize the lower bound. Insert
3906 before the `succeed_n'. The `5' is the last two
3907 bytes of this `set_number_at', plus 3 bytes of
3908 the following `succeed_n'. */
3909 /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE'
3910 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3911 of the following `succeed_n'. */
3912 insert_op2 (set_number_at, laststart, 1
3913 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3914 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3916 if (upper_bound > 1)
3917 { /* More than one repetition is allowed, so
3918 append a backward jump to the `succeed_n'
3919 that starts this interval.
3921 When we've reached this during matching,
3922 we'll have matched the interval once, so
3923 jump back only `upper_bound - 1' times. */
3924 STORE_JUMP2 (jump_n, b, laststart
3925 + 2 * OFFSET_ADDRESS_SIZE + 1,
3927 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3929 /* The location we want to set is the second
3930 parameter of the `jump_n'; that is `b-2' as
3931 an absolute address. `laststart' will be
3932 the `set_number_at' we're about to insert;
3933 `laststart+3' the number to set, the source
3934 for the relative address. But we are
3935 inserting into the middle of the pattern --
3936 so everything is getting moved up by 5.
3937 Conclusion: (b - 2) - (laststart + 3) + 5,
3938 i.e., b - laststart.
3940 We insert this at the beginning of the loop
3941 so that if we fail during matching, we'll
3942 reinitialize the bounds. */
3943 insert_op2 (set_number_at, laststart, b - laststart,
3944 upper_bound - 1, b);
3945 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3949 beg_interval = NULL;
3954 /* If an invalid interval, match the characters as literals. */
3955 assert (beg_interval);
3957 beg_interval = NULL;
3959 /* normal_char and normal_backslash need `c'. */
3962 if (!(syntax & RE_NO_BK_BRACES))
3964 if (p > pattern && p[-1] == '\\')
3965 goto normal_backslash;
3970 /* There is no way to specify the before_dot and after_dot
3971 operators. rms says this is ok. --karl */
3979 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3985 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3991 if (syntax & RE_NO_GNU_OPS)
3994 BUF_PUSH (wordchar);
3999 if (syntax & RE_NO_GNU_OPS)
4002 BUF_PUSH (notwordchar);
4007 if (syntax & RE_NO_GNU_OPS)
4013 if (syntax & RE_NO_GNU_OPS)
4019 if (syntax & RE_NO_GNU_OPS)
4021 BUF_PUSH (wordbound);
4025 if (syntax & RE_NO_GNU_OPS)
4027 BUF_PUSH (notwordbound);
4031 if (syntax & RE_NO_GNU_OPS)
4037 if (syntax & RE_NO_GNU_OPS)
4042 case '1': case '2': case '3': case '4': case '5':
4043 case '6': case '7': case '8': case '9':
4044 if (syntax & RE_NO_BK_REFS)
4050 FREE_STACK_RETURN (REG_ESUBREG);
4052 /* Can't back reference to a subexpression if inside of it. */
4053 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4057 BUF_PUSH_2 (duplicate, c1);
4063 if (syntax & RE_BK_PLUS_QM)
4066 goto normal_backslash;
4070 /* You might think it would be useful for \ to mean
4071 not to translate; but if we don't translate it
4072 it will never match anything. */
4080 /* Expects the character in `c'. */
4082 /* If no exactn currently being built. */
4085 /* If the last exactn handles binary (or character) data and
4086 the new exactn handles character (or binary) data. */
4087 || is_exactn_bin != is_binary[p - 1 - pattern]
4088 #endif /* MBS_SUPPORT */
4090 /* If last exactn not at current position. */
4091 || pending_exact + *pending_exact + 1 != b
4093 /* We have only one byte following the exactn for the count. */
4094 || *pending_exact == (1 << BYTEWIDTH) - 1
4096 /* If followed by a repetition operator. */
4097 || *p == '*' || *p == '^'
4098 || ((syntax & RE_BK_PLUS_QM)
4099 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4100 : (*p == '+' || *p == '?'))
4101 || ((syntax & RE_INTERVALS)
4102 && ((syntax & RE_NO_BK_BRACES)
4104 : (p[0] == '\\' && p[1] == '{'))))
4106 /* Start building a new exactn. */
4111 /* Is this exactn binary data or character? */
4112 is_exactn_bin = is_binary[p - 1 - pattern];
4114 BUF_PUSH_2 (exactn_bin, 0);
4116 BUF_PUSH_2 (exactn, 0);
4118 BUF_PUSH_2 (exactn, 0);
4119 #endif /* MBS_SUPPORT */
4120 pending_exact = b - 1;
4127 } /* while p != pend */
4130 /* Through the pattern now. */
4133 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4135 if (!COMPILE_STACK_EMPTY)
4136 FREE_STACK_RETURN (REG_EPAREN);
4138 /* If we don't want backtracking, force success
4139 the first time we reach the end of the compiled pattern. */
4140 if (syntax & RE_NO_POSIX_BACKTRACKING)
4148 free (compile_stack.stack);
4150 /* We have succeeded; set the length of the buffer. */
4152 bufp->used = (int) b - (int) COMPILED_BUFFER_VAR;
4154 bufp->used = b - bufp->buffer;
4160 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4161 print_compiled_pattern (bufp);
4165 #ifndef MATCH_MAY_ALLOCATE
4166 /* Initialize the failure stack to the largest possible stack. This
4167 isn't necessary unless we're trying to avoid calling alloca in
4168 the search and match routines. */
4170 int num_regs = bufp->re_nsub + 1;
4172 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4173 is strictly greater than re_max_failures, the largest possible stack
4174 is 2 * re_max_failures failure points. */
4175 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4177 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4180 if (! fail_stack.stack)
4182 = (fail_stack_elt_t *) xmalloc (fail_stack.size
4183 * sizeof (fail_stack_elt_t));
4186 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
4188 * sizeof (fail_stack_elt_t)));
4189 # else /* not emacs */
4190 if (! fail_stack.stack)
4192 = (fail_stack_elt_t *) malloc (fail_stack.size
4193 * sizeof (fail_stack_elt_t));
4196 = (fail_stack_elt_t *) realloc (fail_stack.stack,
4198 * sizeof (fail_stack_elt_t)));
4199 # endif /* not emacs */
4202 regex_grow_registers (num_regs);
4204 #endif /* not MATCH_MAY_ALLOCATE */
4207 } /* regex_compile */
4209 /* Subroutines for `regex_compile'. */
4211 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4212 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4215 store_op1 (op, loc, arg)
4220 *loc = (US_CHAR_TYPE) op;
4221 STORE_NUMBER (loc + 1, arg);
4225 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4226 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4229 store_op2 (op, loc, arg1, arg2)
4234 *loc = (US_CHAR_TYPE) op;
4235 STORE_NUMBER (loc + 1, arg1);
4236 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4240 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4241 for OP followed by two-byte integer parameter ARG. */
4242 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4245 insert_op1 (op, loc, arg, end)
4251 register US_CHAR_TYPE *pfrom = end;
4252 register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4254 while (pfrom != loc)
4257 store_op1 (op, loc, arg);
4261 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4262 /* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4265 insert_op2 (op, loc, arg1, arg2, end)
4271 register US_CHAR_TYPE *pfrom = end;
4272 register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4274 while (pfrom != loc)
4277 store_op2 (op, loc, arg1, arg2);
4281 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4282 after an alternative or a begin-subexpression. We assume there is at
4283 least one character before the ^. */
4286 at_begline_loc_p (pattern, p, syntax)
4287 const CHAR_TYPE *pattern, *p;
4288 reg_syntax_t syntax;
4290 const CHAR_TYPE *prev = p - 2;
4291 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4294 /* After a subexpression? */
4295 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4296 /* After an alternative? */
4297 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4301 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4302 at least one character after the $, i.e., `P < PEND'. */
4305 at_endline_loc_p (p, pend, syntax)
4306 const CHAR_TYPE *p, *pend;
4307 reg_syntax_t syntax;
4309 const CHAR_TYPE *next = p;
4310 boolean next_backslash = *next == '\\';
4311 const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0;
4314 /* Before a subexpression? */
4315 (syntax & RE_NO_BK_PARENS ? *next == ')'
4316 : next_backslash && next_next && *next_next == ')')
4317 /* Before an alternative? */
4318 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4319 : next_backslash && next_next && *next_next == '|');
4323 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4324 false if it's not. */
4327 group_in_compile_stack (compile_stack, regnum)
4328 compile_stack_type compile_stack;
4333 for (this_element = compile_stack.avail - 1;
4336 if (compile_stack.stack[this_element].regnum == regnum)
4343 /* This inserts space into the pattern. */
4345 insert_space (num, loc, end)
4350 register CHAR_TYPE *pto = end;
4351 register CHAR_TYPE *pfrom = end - num;
4353 while (pfrom >= loc)
4356 #endif /* MBS_SUPPORT */
4359 static reg_errcode_t
4360 compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
4362 CHAR_TYPE range_start_char;
4363 const CHAR_TYPE **p_ptr, *pend;
4364 CHAR_TYPE *char_set, *b;
4365 RE_TRANSLATE_TYPE translate;
4366 reg_syntax_t syntax;
4368 const CHAR_TYPE *p = *p_ptr;
4369 CHAR_TYPE range_start, range_end;
4373 uint32_t start_val, end_val;
4379 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4382 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4383 _NL_COLLATE_COLLSEQWC);
4385 if (range_start_char < -1)
4387 /* range_start is a collating symbol. */
4389 /* Retrieve the index and get collation sequence value. */
4390 wextra = (int32_t*)char_set[-range_start_char];
4391 start_val = wextra[1 + *wextra];
4394 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4396 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4398 /* Report an error if the range is empty and the syntax prohibits
4400 ret = ((syntax & RE_NO_EMPTY_RANGES)
4401 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4403 /* Insert space to the end of the char_ranges. */
4404 insert_space(2, b - char_set[5] - 2, b - 1);
4405 *(b - char_set[5] - 2) = (wchar_t)start_val;
4406 *(b - char_set[5] - 1) = (wchar_t)end_val;
4407 char_set[4]++; /* ranges_index */
4412 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4414 range_end = TRANSLATE (p[0]);
4415 /* Report an error if the range is empty and the syntax prohibits
4417 ret = ((syntax & RE_NO_EMPTY_RANGES)
4418 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4420 /* Insert space to the end of the char_ranges. */
4421 insert_space(2, b - char_set[5] - 2, b - 1);
4422 *(b - char_set[5] - 2) = range_start;
4423 *(b - char_set[5] - 1) = range_end;
4424 char_set[4]++; /* ranges_index */
4426 /* Have to increment the pointer into the pattern string, so the
4427 caller isn't still at the ending character. */
4433 /* Read the ending character of a range (in a bracket expression) from the
4434 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4435 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4436 Then we set the translation of all bits between the starting and
4437 ending characters (inclusive) in the compiled pattern B.
4439 Return an error code.
4441 We use these short variable names so we can use the same macros as
4442 `regex_compile' itself. */
4444 static reg_errcode_t
4445 compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
4446 unsigned int range_start_char;
4447 const char **p_ptr, *pend;
4448 RE_TRANSLATE_TYPE translate;
4449 reg_syntax_t syntax;
4453 const char *p = *p_ptr;
4456 const unsigned char *collseq;
4457 unsigned int start_colseq;
4458 unsigned int end_colseq;
4466 /* Have to increment the pointer into the pattern string, so the
4467 caller isn't still at the ending character. */
4470 /* Report an error if the range is empty and the syntax prohibits this. */
4471 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4474 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4475 _NL_COLLATE_COLLSEQMB);
4477 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4478 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4479 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4481 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4483 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4485 SET_LIST_BIT (TRANSLATE (this_char));
4490 /* Here we see why `this_char' has to be larger than an `unsigned
4491 char' -- we would otherwise go into an infinite loop, since all
4492 characters <= 0xff. */
4493 range_start_char = TRANSLATE (range_start_char);
4494 /* TRANSLATE(p[0]) is cast to char (not unsigned char) in TRANSLATE,
4495 and some compilers convert it to int implicitly, so the following for loop
4496 may fall into an (almost) infinite loop.
4497 e.g. If translate[p[0]] = 0xff, end_char may equal 0xffffffff.
4498 To avoid this, we cast the translated character to unsigned int and truncate it. */
4499 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4501 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4503 SET_LIST_BIT (TRANSLATE (this_char));
4510 #endif /* MBS_SUPPORT */
4512 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4513 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4514 characters can start a string that matches the pattern. This fastmap
4515 is used by re_search to skip quickly over impossible starting points.
4517 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4518 area as BUFP->fastmap.
4520 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4523 Returns 0 if we succeed, -2 if an internal error. */
4526 /* local function for re_compile_fastmap.
4527 truncate wchar_t character to char. */
4532 unsigned char buf[MB_LEN_MAX];
4533 int retval = wctomb(buf, c);
4534 return retval > 0 ? buf[0] : (unsigned char)c;
4536 #endif /* MBS_SUPPORT */
4539 re_compile_fastmap (bufp)
4540 struct re_pattern_buffer *bufp;
4543 #ifdef MATCH_MAY_ALLOCATE
4544 fail_stack_type fail_stack;
4546 #ifndef REGEX_MALLOC
4550 register char *fastmap = bufp->fastmap;
4553 /* We need to cast pattern to (wchar_t*), because we cast this compiled
4554 pattern to (char*) in regex_compile. */
4555 US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer;
4556 register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used);
4558 US_CHAR_TYPE *pattern = bufp->buffer;
4559 register US_CHAR_TYPE *pend = pattern + bufp->used;
4560 #endif /* MBS_SUPPORT */
4561 US_CHAR_TYPE *p = pattern;
4564 /* This holds the pointer to the failure stack, when
4565 it is allocated relocatably. */
4566 fail_stack_elt_t *failure_stack_ptr;
4569 /* Assume that each path through the pattern can be null until
4570 proven otherwise. We set this false at the bottom of switch
4571 statement, to which we get only if a particular path doesn't
4572 match the empty string. */
4573 boolean path_can_be_null = true;
4575 /* We aren't doing a `succeed_n' to begin with. */
4576 boolean succeed_n_p = false;
4578 assert (fastmap != NULL && p != NULL);
4581 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4582 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4583 bufp->can_be_null = 0;
4587 if (p == pend || *p == succeed)
4589 /* We have reached the (effective) end of pattern. */
4590 if (!FAIL_STACK_EMPTY ())
4592 bufp->can_be_null |= path_can_be_null;
4594 /* Reset for next path. */
4595 path_can_be_null = true;
4597 p = fail_stack.stack[--fail_stack.avail].pointer;
4605 /* We should never be about to go beyond the end of the pattern. */
4608 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4611 /* I guess the idea here is to simply not bother with a fastmap
4612 if a backreference is used, since it's too hard to figure out
4613 the fastmap for the corresponding group. Setting
4614 `can_be_null' stops `re_search_2' from using the fastmap, so
4615 that is all we do. */
4617 bufp->can_be_null = 1;
4621 /* Following are the cases which match a character. These end
4626 fastmap[truncate_wchar(p[1])] = 1;
4635 #endif /* MBS_SUPPORT */
4639 /* It is hard to distinguish fastmap from (multi byte) characters
4640 which depend on the current locale. */
4645 bufp->can_be_null = 1;
4649 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4650 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4656 /* Chars beyond end of map must be allowed. */
4657 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4660 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4661 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4667 for (j = 0; j < (1 << BYTEWIDTH); j++)
4668 if (SYNTAX (j) == Sword)
4674 for (j = 0; j < (1 << BYTEWIDTH); j++)
4675 if (SYNTAX (j) != Sword)
4682 int fastmap_newline = fastmap['\n'];
4684 /* `.' matches anything ... */
4685 for (j = 0; j < (1 << BYTEWIDTH); j++)
4688 /* ... except perhaps newline. */
4689 if (!(bufp->syntax & RE_DOT_NEWLINE))
4690 fastmap['\n'] = fastmap_newline;
4692 /* Return if we have already set `can_be_null'; if we have,
4693 then the fastmap is irrelevant. Something's wrong here. */
4694 else if (bufp->can_be_null)
4697 /* Otherwise, have to check alternative paths. */
4704 for (j = 0; j < (1 << BYTEWIDTH); j++)
4705 if (SYNTAX (j) == (enum syntaxcode) k)
4712 for (j = 0; j < (1 << BYTEWIDTH); j++)
4713 if (SYNTAX (j) != (enum syntaxcode) k)
4718 /* All cases after this match the empty string. These end with
4738 case push_dummy_failure:
4743 case pop_failure_jump:
4744 case maybe_pop_jump:
4747 case dummy_failure_jump:
4748 EXTRACT_NUMBER_AND_INCR (j, p);
4753 /* Jump backward implies we just went through the body of a
4754 loop and matched nothing. Opcode jumped to should be
4755 `on_failure_jump' or `succeed_n'. Just treat it like an
4756 ordinary jump. For a * loop, it has pushed its failure
4757 point already; if so, discard that as redundant. */
4758 if ((re_opcode_t) *p != on_failure_jump
4759 && (re_opcode_t) *p != succeed_n)
4763 EXTRACT_NUMBER_AND_INCR (j, p);
4766 /* If what's on the stack is where we are now, pop it. */
4767 if (!FAIL_STACK_EMPTY ()
4768 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4774 case on_failure_jump:
4775 case on_failure_keep_string_jump:
4776 handle_on_failure_jump:
4777 EXTRACT_NUMBER_AND_INCR (j, p);
4779 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4780 end of the pattern. We don't want to push such a point,
4781 since when we restore it above, entering the switch will
4782 increment `p' past the end of the pattern. We don't need
4783 to push such a point since we obviously won't find any more
4784 fastmap entries beyond `pend'. Such a pattern can match
4785 the null string, though. */
4788 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4790 RESET_FAIL_STACK ();
4795 bufp->can_be_null = 1;
4799 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4800 succeed_n_p = false;
4807 /* Get to the number of times to succeed. */
4808 p += OFFSET_ADDRESS_SIZE;
4810 /* Increment p past the n for when k != 0. */
4811 EXTRACT_NUMBER_AND_INCR (k, p);
4814 p -= 2 * OFFSET_ADDRESS_SIZE;
4815 succeed_n_p = true; /* Spaghetti code alert. */
4816 goto handle_on_failure_jump;
4822 p += 2 * OFFSET_ADDRESS_SIZE;
4833 abort (); /* We have listed all the cases. */
4836 /* Getting here means we have found the possible starting
4837 characters for one path of the pattern -- and that the empty
4838 string does not match. We need not follow this path further.
4839 Instead, look at the next alternative (remembered on the
4840 stack), or quit if no more. The test at the top of the loop
4841 does these things. */
4842 path_can_be_null = false;
4846 /* Set `can_be_null' for the last path (also the first path, if the
4847 pattern is empty). */
4848 bufp->can_be_null |= path_can_be_null;
4851 RESET_FAIL_STACK ();
4853 } /* re_compile_fastmap */
4855 weak_alias (__re_compile_fastmap, re_compile_fastmap)
4858 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4859 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4860 this memory for recording register information. STARTS and ENDS
4861 must be allocated using the malloc library routine, and must each
4862 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4864 If NUM_REGS == 0, then subsequent matches should allocate their own
4867 Unless this function is called, the first search or match using
4868 PATTERN_BUFFER will allocate its own register data, without
4869 freeing the old data. */
4872 re_set_registers (bufp, regs, num_regs, starts, ends)
4873 struct re_pattern_buffer *bufp;
4874 struct re_registers *regs;
4876 regoff_t *starts, *ends;
4880 bufp->regs_allocated = REGS_REALLOCATE;
4881 regs->num_regs = num_regs;
4882 regs->start = starts;
4887 bufp->regs_allocated = REGS_UNALLOCATED;
4889 regs->start = regs->end = (regoff_t *) 0;
4893 weak_alias (__re_set_registers, re_set_registers)
4896 /* Searching routines. */
4898 /* Like re_search_2, below, but only one string is specified, and
4899 doesn't let you say where to stop matching. */
4902 re_search (bufp, string, size, startpos, range, regs)
4903 struct re_pattern_buffer *bufp;
4905 int size, startpos, range;
4906 struct re_registers *regs;
4908 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4912 weak_alias (__re_search, re_search)
4916 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4917 virtual concatenation of STRING1 and STRING2, starting first at index
4918 STARTPOS, then at STARTPOS + 1, and so on.
4920 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4922 RANGE is how far to scan while trying to match. RANGE = 0 means try
4923 only at STARTPOS; in general, the last start tried is STARTPOS +
4926 In REGS, return the indices of the virtual concatenation of STRING1
4927 and STRING2 that matched the entire BUFP->buffer and its contained
4930 Do not consider matching one past the index STOP in the virtual
4931 concatenation of STRING1 and STRING2.
4933 We return either the position in the strings at which the match was
4934 found, -1 if no match, or -2 if error (such as failure
4938 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
4939 struct re_pattern_buffer *bufp;
4940 const char *string1, *string2;
4944 struct re_registers *regs;
4948 register char *fastmap = bufp->fastmap;
4949 register RE_TRANSLATE_TYPE translate = bufp->translate;
4950 int total_size = size1 + size2;
4951 int endpos = startpos + range;
4953 /* Check for out-of-range STARTPOS. */
4954 if (startpos < 0 || startpos > total_size)
4957 /* Fix up RANGE if it might eventually take us outside
4958 the virtual concatenation of STRING1 and STRING2.
4959 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
4961 range = 0 - startpos;
4962 else if (endpos > total_size)
4963 range = total_size - startpos;
4965 /* If the search isn't to be a backwards one, don't waste time in a
4966 search for a pattern that must be anchored. */
4967 if (bufp->used > 0 && range > 0
4968 && ((re_opcode_t) bufp->buffer[0] == begbuf
4969 /* `begline' is like `begbuf' if it cannot match at newlines. */
4970 || ((re_opcode_t) bufp->buffer[0] == begline
4971 && !bufp->newline_anchor)))
4980 /* In a forward search for something that starts with \=.
4981 don't keep searching past point. */
4982 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4984 range = PT - startpos;
4990 /* Update the fastmap now if not correct already. */
4991 if (fastmap && !bufp->fastmap_accurate)
4992 if (re_compile_fastmap (bufp) == -2)
4995 /* Loop through the string, looking for a place to start matching. */
4998 /* If a fastmap is supplied, skip quickly over characters that
4999 cannot be the start of a match. If the pattern can match the
5000 null string, however, we don't need to skip characters; we want
5001 the first null string. */
5002 if (fastmap && startpos < total_size && !bufp->can_be_null)
5004 if (range > 0) /* Searching forwards. */
5006 register const char *d;
5007 register int lim = 0;
5010 if (startpos < size1 && startpos + range >= size1)
5011 lim = range - (size1 - startpos);
5013 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5015 /* Written out as an if-else to avoid testing `translate'
5019 && !fastmap[(unsigned char)
5020 translate[(unsigned char) *d++]])
5023 while (range > lim && !fastmap[(unsigned char) *d++])
5026 startpos += irange - range;
5028 else /* Searching backwards. */
5030 register char c = (size1 == 0 || startpos >= size1
5031 ? string2[startpos - size1]
5032 : string1[startpos]);
5034 if (!fastmap[(unsigned char) TRANSLATE (c)])
5039 /* If can't match the null string, and that's all we have left, fail. */
5040 if (range >= 0 && startpos == total_size && fastmap
5041 && !bufp->can_be_null)
5044 val = re_match_2_internal (bufp, string1, size1, string2, size2,
5045 startpos, regs, stop);
5046 #ifndef REGEX_MALLOC
5075 weak_alias (__re_search_2, re_search_2)
5079 /* This converts PTR, a pointer into one of the search wchar_t strings
5080 `string1' and `string2' into a multibyte string offset from the
5081 beginning of that string. We use mbs_offset to optimize.
5082 See convert_mbs_to_wcs. */
5083 # define POINTER_TO_OFFSET(ptr) \
5084 (FIRST_STRING_P (ptr) \
5085 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5086 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5089 /* This converts PTR, a pointer into one of the search strings `string1'
5090 and `string2' into an offset from the beginning of that string. */
5091 # define POINTER_TO_OFFSET(ptr) \
5092 (FIRST_STRING_P (ptr) \
5093 ? ((regoff_t) ((ptr) - string1)) \
5094 : ((regoff_t) ((ptr) - string2 + size1)))
5095 #endif /* MBS_SUPPORT */
5097 /* Macros for dealing with the split strings in re_match_2. */
5099 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5101 /* Call before fetching a character with *d. This switches over to
5102 string2 if necessary. */
5103 #define PREFETCH() \
5106 /* End of string2 => fail. */ \
5107 if (dend == end_match_2) \
5109 /* End of string1 => advance to string2. */ \
5111 dend = end_match_2; \
5115 /* Test if at very beginning or at very end of the virtual concatenation
5116 of `string1' and `string2'. If only one string, it's `string2'. */
5117 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5118 #define AT_STRINGS_END(d) ((d) == end2)
5121 /* Test if D points to a character which is word-constituent. We have
5122 two special cases to check for: if past the end of string1, look at
5123 the first character in string2; and if before the beginning of
5124 string2, look at the last character in string1. */
5126 /* Use internationalized API instead of SYNTAX. */
5127 # define WORDCHAR_P(d) \
5128 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5129 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0)
5131 # define WORDCHAR_P(d) \
5132 (SYNTAX ((d) == end1 ? *string2 \
5133 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5135 #endif /* MBS_SUPPORT */
5137 /* Disabled due to a compiler bug -- see comment at case wordbound */
5139 /* Test if the character before D and the one at D differ with respect
5140 to being word-constituent. */
5141 #define AT_WORD_BOUNDARY(d) \
5142 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5143 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5146 /* Free everything we malloc. */
5147 #ifdef MATCH_MAY_ALLOCATE
/* Release VAR through REGEX_FREE when it is non-null, then reset it to
   NULL.  The expansion is wrapped in do/while (0) so that it is exactly
   one statement: the previous form expanded to two statements, which
   broke under an unbraced if/else and left the NULL store outside the
   guard.  Use it as `FREE_VAR (x);'.  */
# define FREE_VAR(var) do { if (var) { REGEX_FREE (var); var = NULL; } } while (0)
5150 # define FREE_VARIABLES() \
5152 REGEX_FREE_STACK (fail_stack.stack); \
5153 FREE_VAR (regstart); \
5154 FREE_VAR (regend); \
5155 FREE_VAR (old_regstart); \
5156 FREE_VAR (old_regend); \
5157 FREE_VAR (best_regstart); \
5158 FREE_VAR (best_regend); \
5159 FREE_VAR (reg_info); \
5160 FREE_VAR (reg_dummy); \
5161 FREE_VAR (reg_info_dummy); \
5162 FREE_VAR (string1); \
5163 FREE_VAR (string2); \
5164 FREE_VAR (mbs_offset1); \
5165 FREE_VAR (mbs_offset2); \
5166 FREE_VAR (is_binary1); \
5167 FREE_VAR (is_binary2); \
5169 # else /* not MBS_SUPPORT */
5170 # define FREE_VARIABLES() \
5172 REGEX_FREE_STACK (fail_stack.stack); \
5173 FREE_VAR (regstart); \
5174 FREE_VAR (regend); \
5175 FREE_VAR (old_regstart); \
5176 FREE_VAR (old_regend); \
5177 FREE_VAR (best_regstart); \
5178 FREE_VAR (best_regend); \
5179 FREE_VAR (reg_info); \
5180 FREE_VAR (reg_dummy); \
5181 FREE_VAR (reg_info_dummy); \
5183 # endif /* MBS_SUPPORT */
5186 # define FREE_VARIABLES() \
5188 if (string1) free (string1); \
5189 if (string2) free (string2); \
5190 if (mbs_offset1) free (mbs_offset1); \
5191 if (mbs_offset2) free (mbs_offset2); \
5192 if (is_binary1) free (is_binary1); \
5193 if (is_binary2) free (is_binary2); \
5196 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5197 # endif /* MBS_SUPPORT */
5198 #endif /* not MATCH_MAY_ALLOCATE */
5200 /* These values must meet several constraints. They must not be valid
5201 register values; since we have a limit of 255 registers (because
5202 we use only one byte in the pattern for the register number), we can
5203 use numbers larger than 255. They must differ by 1, because of
5204 NUM_FAILURE_ITEMS above. And the value for the lowest register must
5205 be larger than the value for the highest register, so we do not try
5206 to actually save any registers when none are active.
   The matcher initializes highest_active_reg and lowest_active_reg to
   these values, and stores them again when a stop_memory leaves no
   group active. */
5207 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
5208 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5210 /* Matching routines. */
5212 #ifndef emacs /* Emacs never uses this. */
5213 /* re_match is like re_match_2 except it takes only a single string:
   STRING (of length SIZE) is passed to re_match_2_internal as the second
   string, with NULL and 0 standing in for the first string and its size. */
5216 re_match (bufp, string, size, pos, regs)
5217 struct re_pattern_buffer *bufp;
5220 struct re_registers *regs;
5222 int result = re_match_2_internal (bufp, NULL, 0, string, size,
5224 # ifndef REGEX_MALLOC
5232 weak_alias (__re_match, re_match)
5234 #endif /* not emacs */
/* Prototypes for file-local helper routines.  group_match_null_string_p
   is called when re_match_2_internal executes a `start_memory' opcode,
   to find out whether the group can match the empty string.  */
5236 static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
5238 register_info_type *reg_info));
5239 static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p,
5241 register_info_type *reg_info));
5242 static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
5244 register_info_type *reg_info));
5245 static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2,
5246 int len, char *translate));
5248 /* re_match_2 matches the compiled pattern in BUFP against the
5249 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5250 and SIZE2, respectively). We start matching at POS, and stop
   matching at STOP. The work itself is done by re_match_2_internal.
5253 If REGS is non-null and the `no_sub' field of BUFP is zero, we
5254 store offsets for the substring each group matched in REGS. See the
5255 documentation for exactly how many groups we fill.
5257 We return -1 if no match, -2 if an internal error (such as the
5258 failure stack overflowing). Otherwise, we return the length of the
5259 matched substring. */
5262 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
5263 struct re_pattern_buffer *bufp;
5264 const char *string1, *string2;
5267 struct re_registers *regs;
5270 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
5272 #ifndef REGEX_MALLOC
5280 weak_alias (__re_match_2, re_match_2)
5284 /* Check the substring (from 0 to LENGTH) of the multibyte string to
5285 which OFFSET_BUFFER corresponds, and count how many wchar_t characters
5286 that substring occupies. OFFSET_BUFFER is used as an optimization: it is
5287 scanned entry by entry up to its -1 sentinel. See convert_mbs_to_wcs. */
5289 count_mbs_length(offset_buffer, length)
5295 /* Check whether the size is valid. */
5299 if (offset_buffer == NULL)
5302 for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++)
5304 if (offset_buffer[wcs_size] == length)
5306 if (offset_buffer[wcs_size] > length)
5307 /* LENGTH falls inside a wide character: it is a fragment. */
5311 /* We reached the sentinel without finding LENGTH. */
5314 #endif /* MBS_SUPPORT */
5316 /* This is a separate function so that we can force an alloca cleanup
5320 re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop)
5321 struct re_pattern_buffer *bufp;
5322 const char *cstring1, *cstring2;
5325 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5326 struct re_pattern_buffer *bufp;
5327 const char *string1, *string2;
5331 struct re_registers *regs;
5334 /* General temporaries. */
5338 /* We need wchar_t* buffers corresponding to cstring1 and cstring2. */
5339 CHAR_TYPE *string1 = NULL, *string2 = NULL;
5340 /* We need the sizes of the wchar_t buffers corresponding to csize1 and csize2. */
5341 int size1 = 0, size2 = 0;
5342 /* Offset buffers for optimization. See convert_mbs_to_wcs. */
5343 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5344 /* They hold whether each wchar_t is binary data or not. */
5345 int *is_binary1 = NULL, *is_binary2 = NULL;
5346 #endif /* MBS_SUPPORT */
5348 /* Just past the end of the corresponding string. */
5349 const CHAR_TYPE *end1, *end2;
5351 /* Pointers into string1 and string2, just past the last characters in
5352 each to consider matching. */
5353 const CHAR_TYPE *end_match_1, *end_match_2;
5355 /* Where we are in the data, and the end of the current string. */
5356 const CHAR_TYPE *d, *dend;
5358 /* Where we are in the pattern, and the end of the pattern. */
5360 US_CHAR_TYPE *pattern, *p;
5361 register US_CHAR_TYPE *pend;
5363 US_CHAR_TYPE *p = bufp->buffer;
5364 register US_CHAR_TYPE *pend = p + bufp->used;
5365 #endif /* MBS_SUPPORT */
5367 /* Mark the opcode just after a start_memory, so we can test for an
5368 empty subpattern when we get to the stop_memory. */
5369 US_CHAR_TYPE *just_past_start_mem = 0;
5371 /* We use this to map every character in the string. */
5372 RE_TRANSLATE_TYPE translate = bufp->translate;
5374 /* Failure point stack. Each place that can handle a failure further
5375 down the line pushes a failure point on this stack. It consists of
5376 restart, regend, and reg_info for all registers corresponding to
5377 the subexpressions we're currently inside, plus the number of such
5378 registers, and, finally, two char *'s. The first char * is where
5379 to resume scanning the pattern; the second one is where to resume
5380 scanning the strings. If the latter is zero, the failure point is
5381 a ``dummy''; if a failure happens and the failure point is a dummy,
5382 it gets discarded and the next one is tried. */
5383 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5384 fail_stack_type fail_stack;
5387 static unsigned failure_id;
5388 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5392 /* This holds the pointer to the failure stack, when
5393 it is allocated relocatably. */
5394 fail_stack_elt_t *failure_stack_ptr;
5397 /* We fill all the registers internally, independent of what we
5398 return, for use in backreferences. The number here includes
5399 an element for register zero. */
5400 size_t num_regs = bufp->re_nsub + 1;
5402 /* The currently active registers. */
5403 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5404 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5406 /* Information on the contents of registers. These are pointers into
5407 the input strings; they record just what was matched (on this
5408 attempt) by a subexpression part of the pattern, that is, the
5409 regnum-th regstart pointer points to where in the input we began
5410 matching and the regnum-th regend points to right after where we
5411 stopped matching the regnum-th subexpression. (The zeroth register
5412 keeps track of what the whole pattern matches.) */
5413 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5414 const CHAR_TYPE **regstart, **regend;
5417 /* If a group that's operated upon by a repetition operator fails to
5418 match anything, then the register for its start will need to be
5419 restored because it will have been set to wherever in the string we
5420 are when we last see its open-group operator. Similarly for a
5422 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5423 const CHAR_TYPE **old_regstart, **old_regend;
5426 /* The is_active field of reg_info helps us keep track of which (possibly
5427 nested) subexpressions we are currently in. The matched_something
5428 field of reg_info[reg_num] helps us tell whether or not we have
5429 matched any of the pattern so far this time through the reg_num-th
5430 subexpression. These two fields get reset each time through any
5431 loop their register is in. */
5432 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5433 register_info_type *reg_info;
5436 /* The following record the register info as found in the above
5437 variables when we find a match better than any we've seen before.
5438 This happens as we backtrack through the failure points, which in
5439 turn happens only if we have not yet matched the entire string. */
5440 unsigned best_regs_set = false;
5441 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5442 const CHAR_TYPE **best_regstart, **best_regend;
5445 /* Logically, this is `best_regend[0]'. But we don't want to have to
5446 allocate space for that if we're not allocating space for anything
5447 else (see below). Also, we never need info about register 0 for
5448 any of the other register vectors, and it seems rather a kludge to
5449 treat `best_regend' differently than the rest. So we keep track of
5450 the end of the best match so far in a separate variable. We
5451 initialize this to NULL so that when we backtrack the first time
5452 and need to test it, it's not garbage. */
5453 const CHAR_TYPE *match_end = NULL;
5455 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5456 int set_regs_matched_done = 0;
5458 /* Used when we pop values we don't care about. */
5459 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5460 const CHAR_TYPE **reg_dummy;
5461 register_info_type *reg_info_dummy;
5465 /* Counts the total number of registers pushed. */
5466 unsigned num_regs_pushed = 0;
5469 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5473 #ifdef MATCH_MAY_ALLOCATE
5474 /* Do not bother to initialize all the register variables if there are
5475 no groups in the pattern, as it takes a fair amount of time. If
5476 there are groups, we include space for register 0 (the whole
5477 pattern), even though we never use it, since it simplifies the
5478 array indexing. We should fix this. */
5481 regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5482 regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5483 old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5484 old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5485 best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5486 best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5487 reg_info = REGEX_TALLOC (num_regs, register_info_type);
5488 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5489 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
5491 if (!(regstart && regend && old_regstart && old_regend && reg_info
5492 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5500 /* We must initialize all our variables to NULL, so that
5501 `FREE_VARIABLES' doesn't try to free them. */
5502 regstart = regend = old_regstart = old_regend = best_regstart
5503 = best_regend = reg_dummy = NULL;
5504 reg_info = reg_info_dummy = (register_info_type *) NULL;
5506 #endif /* MATCH_MAY_ALLOCATE */
5508 /* The starting position is bogus. */
5510 if (pos < 0 || pos > csize1 + csize2)
5512 if (pos < 0 || pos > size1 + size2)
5520 /* Allocate wchar_t array for string1 and string2 and
5521 fill them with converted string. */
5524 string1 = TALLOC (csize1 + 1, CHAR_TYPE);
5525 mbs_offset1 = TALLOC (csize1 + 1, int);
5526 is_binary1 = TALLOC (csize1 + 1, int);
5527 if (!string1 || !mbs_offset1 || !is_binary1)
5529 if (string1) free(string1);
5530 if (mbs_offset1) free(mbs_offset1);
5531 if (is_binary1) free(is_binary1);
5534 size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
5535 mbs_offset1, is_binary1);
5536 string1[size1] = L'\0'; /* for a sentinel */
5540 string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
5541 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5542 is_binary2 = TALLOC (csize2 + 1, int);
5543 if (!string2 || !mbs_offset2 || !is_binary2)
5545 if (string1) free(string1);
5546 if (mbs_offset1) free(mbs_offset1);
5547 if (is_binary1) free(is_binary1);
5548 if (string2) free(string2);
5549 if (mbs_offset2) free(mbs_offset2);
5550 if (is_binary2) free(is_binary2);
5553 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5554 mbs_offset2, is_binary2);
5555 string2[size2] = L'\0'; /* for a sentinel */
5558 /* We need to cast pattern to (wchar_t*), because this compiled
5559 pattern was cast to (char*) in regex_compile. */
5560 p = pattern = (CHAR_TYPE*)bufp->buffer;
5561 pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
5563 #endif /* MBS_SUPPORT */
5565 /* Initialize subexpression text positions to -1 to mark ones that no
5566 start_memory/stop_memory has been seen for. Also initialize the
5567 register information struct. */
5568 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5570 regstart[mcnt] = regend[mcnt]
5571 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5573 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5574 IS_ACTIVE (reg_info[mcnt]) = 0;
5575 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5576 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5579 /* We move `string1' into `string2' if the latter's empty -- but not if
5580 `string1' is null. */
5581 if (size2 == 0 && string1 != NULL)
5588 end1 = string1 + size1;
5589 end2 = string2 + size2;
5591 /* Compute where to stop matching, within the two strings. */
5595 mcnt = count_mbs_length(mbs_offset1, stop);
5596 end_match_1 = string1 + mcnt;
5597 end_match_2 = string2;
5602 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5603 end_match_2 = string2 + mcnt;
5606 { /* count_mbs_length return error. */
5613 end_match_1 = string1 + stop;
5614 end_match_2 = string2;
5619 end_match_2 = string2 + stop - size1;
5621 #endif /* MBS_SUPPORT */
5623 /* `p' scans through the pattern as `d' scans through the data.
5624 `dend' is the end of the input string that `d' points within. `d'
5625 is advanced into the following input string whenever necessary, but
5626 this happens before fetching; therefore, at the beginning of the
5627 loop, `d' can be pointing at the end of a string, but it cannot
5630 if (size1 > 0 && pos <= csize1)
5632 mcnt = count_mbs_length(mbs_offset1, pos);
5638 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5644 { /* count_mbs_length return error. */
5649 if (size1 > 0 && pos <= size1)
5656 d = string2 + pos - size1;
5659 #endif /* MBS_SUPPORT */
5661 DEBUG_PRINT1 ("The compiled pattern is:\n");
5662 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5663 DEBUG_PRINT1 ("The string to match is: `");
5664 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5665 DEBUG_PRINT1 ("'\n");
5667 /* This loops over pattern commands. It exits by returning from the
5668 function if the match is complete, or it drops through if the match
5669 fails at this starting point in the input data. */
5673 DEBUG_PRINT2 ("\n%p: ", p);
5675 DEBUG_PRINT2 ("\n0x%x: ", p);
5679 { /* End of pattern means we might have succeeded. */
5680 DEBUG_PRINT1 ("end of pattern ... ");
5682 /* If we haven't matched the entire string, and we want the
5683 longest match, try backtracking. */
5684 if (d != end_match_2)
5686 /* 1 if this match ends in the same string (string1 or string2)
5687 as the best previous match. */
5688 boolean same_str_p = (FIRST_STRING_P (match_end)
5689 == MATCHING_IN_FIRST_STRING);
5690 /* 1 if this match is the best seen so far. */
5691 boolean best_match_p;
5693 /* AIX compiler got confused when this was combined
5694 with the previous declaration. */
5696 best_match_p = d > match_end;
5698 best_match_p = !MATCHING_IN_FIRST_STRING;
5700 DEBUG_PRINT1 ("backtracking.\n");
5702 if (!FAIL_STACK_EMPTY ())
5703 { /* More failure points to try. */
5705 /* If exceeds best match so far, save it. */
5706 if (!best_regs_set || best_match_p)
5708 best_regs_set = true;
5711 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5713 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5715 best_regstart[mcnt] = regstart[mcnt];
5716 best_regend[mcnt] = regend[mcnt];
5722 /* If no failure points, don't restore garbage. And if
5723 last match is real best match, don't restore second
5725 else if (best_regs_set && !best_match_p)
5728 /* Restore best match. It may happen that `dend ==
5729 end_match_1' while the restored d is in string2.
5730 For example, the pattern `x.*y.*z' against the
5731 strings `x-' and `y-z-', if the two strings are
5732 not consecutive in memory. */
5733 DEBUG_PRINT1 ("Restoring best registers.\n");
5736 dend = ((d >= string1 && d <= end1)
5737 ? end_match_1 : end_match_2);
5739 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5741 regstart[mcnt] = best_regstart[mcnt];
5742 regend[mcnt] = best_regend[mcnt];
5745 } /* d != end_match_2 */
5748 DEBUG_PRINT1 ("Accepting match.\n");
5749 /* If caller wants register contents data back, do it. */
5750 if (regs && !bufp->no_sub)
5752 /* Have the register data arrays been allocated? */
5753 if (bufp->regs_allocated == REGS_UNALLOCATED)
5754 { /* No. So allocate them with malloc. We need one
5755 extra element beyond `num_regs' for the `-1' marker
5757 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5758 regs->start = TALLOC (regs->num_regs, regoff_t);
5759 regs->end = TALLOC (regs->num_regs, regoff_t);
5760 if (regs->start == NULL || regs->end == NULL)
5765 bufp->regs_allocated = REGS_REALLOCATE;
5767 else if (bufp->regs_allocated == REGS_REALLOCATE)
5768 { /* Yes. If we need more elements than were already
5769 allocated, reallocate them. If we need fewer, just
5771 if (regs->num_regs < num_regs + 1)
5773 regs->num_regs = num_regs + 1;
5774 RETALLOC (regs->start, regs->num_regs, regoff_t);
5775 RETALLOC (regs->end, regs->num_regs, regoff_t);
5776 if (regs->start == NULL || regs->end == NULL)
5785 /* These braces fend off a "empty body in an else-statement"
5786 warning under GCC when assert expands to nothing. */
5787 assert (bufp->regs_allocated == REGS_FIXED);
5790 /* Convert the pointer data in `regstart' and `regend' to
5791 indices. Register zero has to be set differently,
5792 since we haven't kept track of any info for it. */
5793 if (regs->num_regs > 0)
5795 regs->start[0] = pos;
5797 if (MATCHING_IN_FIRST_STRING)
5798 regs->end[0] = mbs_offset1 != NULL ?
5799 mbs_offset1[d-string1] : 0;
5801 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
5802 mbs_offset2[d-string2] : 0);
5804 regs->end[0] = (MATCHING_IN_FIRST_STRING
5805 ? ((regoff_t) (d - string1))
5806 : ((regoff_t) (d - string2 + size1)));
5807 #endif /* MBS_SUPPORT */
5810 /* Go through the first `min (num_regs, regs->num_regs)'
5811 registers, since that is all we initialized. */
5812 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
5815 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
5816 regs->start[mcnt] = regs->end[mcnt] = -1;
5820 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
5822 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
5826 /* If the regs structure we return has more elements than
5827 were in the pattern, set the extra elements to -1. If
5828 we (re)allocated the registers, this is the case,
5829 because we always allocate enough to have at least one
5831 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
5832 regs->start[mcnt] = regs->end[mcnt] = -1;
5833 } /* regs && !bufp->no_sub */
5835 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5836 nfailure_points_pushed, nfailure_points_popped,
5837 nfailure_points_pushed - nfailure_points_popped);
5838 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
5841 if (MATCHING_IN_FIRST_STRING)
5842 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
5844 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
5848 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5851 #endif /* MBS_SUPPORT */
5853 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
5859 /* Otherwise match next pattern command. */
5860 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5862 /* Ignore these. Used to ignore the n of succeed_n's which
5863 currently have n == 0. */
5865 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5869 DEBUG_PRINT1 ("EXECUTING succeed.\n");
5872 /* Match the next n pattern characters exactly. The following
5873 byte in the pattern defines n, and the n bytes after that
5874 are the characters to match. */
5880 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
5882 /* This is written out as an if-else so we don't waste time
5883 testing `translate' inside the loop. */
5892 if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
5893 != (US_CHAR_TYPE) *p++)
5898 if (*d++ != (CHAR_TYPE) *p++)
5902 if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
5903 != (US_CHAR_TYPE) *p++)
5905 #endif /* MBS_SUPPORT */
5914 if (*d++ != (CHAR_TYPE) *p++) goto fail;
5918 SET_REGS_MATCHED ();
5922 /* Match any character except possibly a newline or a null. */
5924 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5928 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
5929 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
5932 SET_REGS_MATCHED ();
5933 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5941 register US_CHAR_TYPE c;
5943 unsigned int i, char_class_length, coll_symbol_length,
5944 equiv_class_length, ranges_length, chars_length, length;
5945 CHAR_TYPE *workp, *workp2, *charset_top;
5946 #define WORK_BUFFER_SIZE 128
5947 CHAR_TYPE str_buf[WORK_BUFFER_SIZE];
5951 #endif /* MBS_SUPPORT */
5952 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5954 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
5956 c = TRANSLATE (*d); /* The character to match. */
5959 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5961 charset_top = p - 1;
5962 char_class_length = *p++;
5963 coll_symbol_length = *p++;
5964 equiv_class_length = *p++;
5965 ranges_length = *p++;
5966 chars_length = *p++;
5967 /* p points charset[6], so the address of the next instruction
5968 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
5969 where l=length of char_classes, m=length of collating_symbol,
5970 n=equivalence_class, o=length of char_range,
5971 p'=length of character. */
5973 /* Update p to indicate the next instruction. */
5974 p += char_class_length + coll_symbol_length+ equiv_class_length +
5975 2*ranges_length + chars_length;
5977 /* match with char_class? */
5978 for (i = 0; i < char_class_length ; i++)
5979 if (iswctype((wint_t)c, (wctype_t)(*workp++)))
5980 goto char_set_matched;
5982 /* match with collating_symbol? */
5986 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
5990 wextra = (int32_t*) *workp++;
5991 for (i = 0; i < *wextra; ++i)
5992 if (TRANSLATE(d[i]) != wextra[1 + i])
5997 /* Update d, however d will be incremented at
5998 char_set_matched:, we decrement d here. */
6000 goto char_set_matched;
6004 else /* (nrules == 0) */
6006 /* If we can't look up collation data, we use wcscoll
6009 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6011 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6012 length = wcslen(workp);
6014 /* If wcscoll(the collating symbol, whole string) > 0,
6015 any substring of the string never match with the
6016 collating symbol. */
6017 if (wcscoll(workp, d) > 0)
6019 workp += length + 1;
6023 /* First, we compare the collating symbol with
6024 the first character of the string.
6025 If it don't match, we add the next character to
6026 the compare buffer in turn. */
6027 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6032 if (dend == end_match_2)
6038 /* add next character to the compare buffer. */
6039 str_buf[i] = TRANSLATE(*d);
6040 str_buf[i+1] = '\0';
6042 match = wcscoll(workp, str_buf);
6044 goto char_set_matched;
6047 /* (str_buf > workp) indicate (str_buf + X > workp),
6048 because for all X (str_buf + X > str_buf).
6049 So we don't need continue this loop. */
6052 /* Otherwise(str_buf < workp),
6053 (str_buf+next_character) may equals (workp).
6054 So we continue this loop. */
6059 workp += length + 1;
6062 /* match with equivalence_class? */
6066 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6067 /* Try to match the equivalence class against
6068 those known to the collate implementation. */
6069 const int32_t *table;
6070 const int32_t *weights;
6071 const int32_t *extra;
6072 const int32_t *indirect;
6077 /* This #include defines a local function! */
6078 # include <locale/weightwc.h>
6080 table = (const int32_t *)
6081 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6082 weights = (const wint_t *)
6083 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6084 extra = (const wint_t *)
6085 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6086 indirect = (const int32_t *)
6087 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6089 /* Write 1 collating element to str_buf, and
6093 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6095 cp = (wint_t*)str_buf;
6098 if (dend == end_match_2)
6103 str_buf[i] = TRANSLATE(*(d+i));
6104 str_buf[i+1] = '\0'; /* sentinel */
6105 idx2 = findidx ((const wint_t**)&cp);
6108 /* Update d, however d will be incremented at
6109 char_set_matched:, we decrement d here. */
6110 d = backup_d + (wint_t)cp - (wint_t)str_buf - 1;
6113 if (dend == end_match_2)
6122 len = weights[idx2];
6124 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6127 idx = (int32_t)*workp;
6128 /* We already checked idx != 0 in regex_compile. */
6130 if (idx2 != 0 && len == weights[idx])
6133 while (cnt < len && (weights[idx + 1 + cnt]
6134 == weights[idx2 + 1 + cnt]))
6138 goto char_set_matched;
6145 else /* (nrules == 0) */
6147 /* If we can't look up collation data, we use wcscoll
6150 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6152 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6153 length = wcslen(workp);
6155 /* If wcscoll(the collating symbol, whole string) > 0,
6156 any substring of the string never match with the
6157 collating symbol. */
6158 if (wcscoll(workp, d) > 0)
6160 workp += length + 1;
6164 /* First, we compare the equivalence class with
6165 the first character of the string.
6166 If it doesn't match, we add the next character to
6167 the compare buffer in turn. */
6168 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6173 if (dend == end_match_2)
6179 /* add next character to the compare buffer. */
6180 str_buf[i] = TRANSLATE(*d);
6181 str_buf[i+1] = '\0';
6183 match = wcscoll(workp, str_buf);
6186 goto char_set_matched;
6189 /* (str_buf > workp) indicate (str_buf + X > workp),
6190 because for all X (str_buf + X > str_buf).
6191 So we don't need continue this loop. */
6194 /* Otherwise(str_buf < workp),
6195 (str_buf+next_character) may equals (workp).
6196 So we continue this loop. */
6201 workp += length + 1;
6205 /* match with char_range? */
6209 uint32_t collseqval;
6210 const char *collseq = (const char *)
6211 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6213 collseqval = collseq_table_lookup (collseq, c);
6215 for (; workp < p - chars_length ;)
6217 uint32_t start_val, end_val;
6219 /* We already compute the collation sequence value
6220 of the characters (or collating symbols). */
6221 start_val = (uint32_t) *workp++; /* range_start */
6222 end_val = (uint32_t) *workp++; /* range_end */
6224 if (start_val <= collseqval && collseqval <= end_val)
6225 goto char_set_matched;
6231 /* We set range_start_char at str_buf[0], range_end_char
6232 at str_buf[4], and compared char at str_buf[2]. */
6237 for (; workp < p - chars_length ;)
6239 wchar_t *range_start_char, *range_end_char;
6241 /* match if (range_start_char <= c <= range_end_char). */
6243 /* If range_start(or end) < 0, we assume -range_start(end)
6244 is the offset of the collating symbol which is specified
6245 as the character of the range start(end). */
6249 range_start_char = charset_top - (*workp++);
6252 str_buf[0] = *workp++;
6253 range_start_char = str_buf;
6258 range_end_char = charset_top - (*workp++);
6261 str_buf[4] = *workp++;
6262 range_end_char = str_buf + 4;
6265 if (wcscoll(range_start_char, str_buf+2) <= 0 &&
6266 wcscoll(str_buf+2, range_end_char) <= 0)
6268 goto char_set_matched;
6272 /* match with char? */
6273 for (; workp < p ; workp++)
6275 goto char_set_matched;
6282 /* Cast to `unsigned' instead of `unsigned char' in case the
6283 bit list is a full 32 bytes long. */
6284 if (c < (unsigned) (*p * BYTEWIDTH)
6285 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6290 if (!not) goto fail;
6291 #undef WORK_BUFFER_SIZE
6292 #endif /* MBS_SUPPORT */
6293 SET_REGS_MATCHED ();
6299 /* The beginning of a group is represented by start_memory.
6300 The arguments are the register number in the next byte, and the
6301 number of groups inner to this one in the next. The text
6302 matched within the group is recorded (in the internal
6303 registers data structure) under the register number. */
6305 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
6307 /* Find out if this group can match the empty string. */
6308 p1 = p; /* To send to group_match_null_string_p. */
6310 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6311 REG_MATCH_NULL_STRING_P (reg_info[*p])
6312 = group_match_null_string_p (&p1, pend, reg_info);
6314 /* Save the position in the string where we were the last time
6315 we were at this open-group operator in case the group is
6316 operated upon by a repetition operator, e.g., with `(a*)*b'
6317 against `ab'; then we want to ignore where we are now in
6318 the string in case this attempt to match fails. */
6319 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6320 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6322 DEBUG_PRINT2 (" old_regstart: %d\n",
6323 POINTER_TO_OFFSET (old_regstart[*p]));
6326 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6328 IS_ACTIVE (reg_info[*p]) = 1;
6329 MATCHED_SOMETHING (reg_info[*p]) = 0;
6331 /* Clear this whenever we change the register activity status. */
6332 set_regs_matched_done = 0;
6334 /* This is the new highest active register. */
6335 highest_active_reg = *p;
6337 /* If nothing was active before, this is the new lowest active
6339 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6340 lowest_active_reg = *p;
6342 /* Move past the register number and inner group count. */
6344 just_past_start_mem = p;
6349 /* The stop_memory opcode represents the end of a group. Its
6350 arguments are the same as start_memory's: the register
6351 number, and the number of inner groups. */
6353 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
6355 /* We need to save the string position the last time we were at
6356 this close-group operator in case the group is operated
6357 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6358 against `aba'; then we want to ignore where we are now in
6359 the string in case this attempt to match fails. */
6360 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6361 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6363 DEBUG_PRINT2 (" old_regend: %d\n",
6364 POINTER_TO_OFFSET (old_regend[*p]));
6367 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6369 /* This register isn't active anymore. */
6370 IS_ACTIVE (reg_info[*p]) = 0;
6372 /* Clear this whenever we change the register activity status. */
6373 set_regs_matched_done = 0;
6375 /* If this was the only register active, nothing is active
6377 if (lowest_active_reg == highest_active_reg)
6379 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6380 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6383 { /* We must scan for the new highest active register, since
6384 it isn't necessarily one less than now: consider
6385 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6386 new highest active register is 1. */
6387 US_CHAR_TYPE r = *p - 1;
6388 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6391 /* If we end up at register zero, that means that we saved
6392 the registers as the result of an `on_failure_jump', not
6393 a `start_memory', and we jumped to past the innermost
6394 `stop_memory'. For example, in ((.)*) we save
6395 registers 1 and 2 as a result of the *, but when we pop
6396 back to the second ), we are at the stop_memory 1.
6397 Thus, nothing is active. */
6400 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6401 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6404 highest_active_reg = r;
6407 /* If just failed to match something this time around with a
6408 group that's operated on by a repetition operator, try to
6409 force exit from the ``loop'', and restore the register
6410 information for this group that we had before trying this
6412 if ((!MATCHED_SOMETHING (reg_info[*p])
6413 || just_past_start_mem == p - 1)
6416 boolean is_a_jump_n = false;
6420 switch ((re_opcode_t) *p1++)
6424 case pop_failure_jump:
6425 case maybe_pop_jump:
6427 case dummy_failure_jump:
6428 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6430 p1 += OFFSET_ADDRESS_SIZE;
6438 /* If the next operation is a jump backwards in the pattern
6439 to an on_failure_jump right before the start_memory
6440 corresponding to this stop_memory, exit from the loop
6441 by forcing a failure after pushing on the stack the
6442 on_failure_jump's jump in the pattern, and d. */
6443 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6444 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6445 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6447 /* If this group ever matched anything, then restore
6448 what its registers were before trying this last
6449 failed match, e.g., with `(a*)*b' against `ab' for
6450 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6451 against `aba' for regend[3].
6453 Also restore the registers for inner groups for,
6454 e.g., `((a*)(b*))*' against `aba' (register 3 would
6455 otherwise get trashed). */
6457 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6461 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6463 /* Restore this and inner groups' (if any) registers. */
6464 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6467 regstart[r] = old_regstart[r];
6469 /* xx why this test? */
6470 if (old_regend[r] >= regstart[r])
6471 regend[r] = old_regend[r];
6475 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6476 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6482 /* Move past the register number and the inner group count. */
6487 /* \<digit> has been turned into a `duplicate' command which is
6488 followed by the numeric value of <digit> as the register number. */
6491 register const CHAR_TYPE *d2, *dend2;
6492 int regno = *p++; /* Get which register to match against. */
6493 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6495 /* Can't back reference a group which we've never matched. */
6496 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6499 /* Where in input to try to start matching. */
6500 d2 = regstart[regno];
6502 /* Where to stop matching; if both the place to start and
6503 the place to stop matching are in the same string, then
6504 set to the place to stop, otherwise, for now have to use
6505 the end of the first string. */
6507 dend2 = ((FIRST_STRING_P (regstart[regno])
6508 == FIRST_STRING_P (regend[regno]))
6509 ? regend[regno] : end_match_1);
6512 /* If necessary, advance to next segment in register
6516 if (dend2 == end_match_2) break;
6517 if (dend2 == regend[regno]) break;
6519 /* End of string1 => advance to string2. */
6521 dend2 = regend[regno];
6523 /* At end of register contents => success */
6524 if (d2 == dend2) break;
6526 /* If necessary, advance to next segment in data. */
6529 /* How many characters left in this segment to match. */
6532 /* Want how many consecutive characters we can match in
6533 one shot, so, if necessary, adjust the count. */
6534 if (mcnt > dend2 - d2)
6537 /* Compare that many; failure if mismatch, else move
6540 ? bcmp_translate (d, d2, mcnt, translate)
6541 : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
6543 d += mcnt, d2 += mcnt;
6545 /* Do this because we've matched some characters. */
6546 SET_REGS_MATCHED ();
6552 /* begline matches the empty string at the beginning of the string
6553 (unless `not_bol' is set in `bufp'), and, if
6554 `newline_anchor' is set, after newlines. */
6556 DEBUG_PRINT1 ("EXECUTING begline.\n");
6558 if (AT_STRINGS_BEG (d))
6560 if (!bufp->not_bol) break;
6562 else if (d[-1] == '\n' && bufp->newline_anchor)
6566 /* In all other cases, we fail. */
6570 /* endline is the dual of begline. */
6572 DEBUG_PRINT1 ("EXECUTING endline.\n");
6574 if (AT_STRINGS_END (d))
6576 if (!bufp->not_eol) break;
6579 /* We have to ``prefetch'' the next character. */
6580 else if ((d == end1 ? *string2 : *d) == '\n'
6581 && bufp->newline_anchor)
6588 /* Match at the very beginning of the data. */
6590 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6591 if (AT_STRINGS_BEG (d))
6596 /* Match at the very end of the data. */
6598 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6599 if (AT_STRINGS_END (d))
6604 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6605 pushes NULL as the value for the string on the stack. Then
6606 `pop_failure_point' will keep the current value for the
6607 string, instead of restoring it. To see why, consider
6608 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6609 then the . fails against the \n. But the next thing we want
6610 to do is match the \n against the \n; if we restored the
6611 string value, we would be back at the foo.
6613 Because this is used only in specific cases, we don't need to
6614 check all the things that `on_failure_jump' does, to make
6615 sure the right things get saved on the stack. Hence we don't
6616 share its code. The only reason to push anything on the
6617 stack at all is that otherwise we would have to change
6618 `anychar's code to do something besides goto fail in this
6619 case; that seems worse than this. */
6620 case on_failure_keep_string_jump:
6621 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6623 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6625 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6627 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6630 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6634 /* Uses of on_failure_jump:
6636 Each alternative starts with an on_failure_jump that points
6637 to the beginning of the next alternative. Each alternative
6638 except the last ends with a jump that in effect jumps past
6639 the rest of the alternatives. (They really jump to the
6640 ending jump of the following alternative, because tensioning
6641 these jumps is a hassle.)
6643 Repeats start with an on_failure_jump that points past both
6644 the repetition text and either the following jump or
6645 pop_failure_jump back to this on_failure_jump. */
6646 case on_failure_jump:
6648 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6650 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6652 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6654 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6657 /* If this on_failure_jump comes right before a group (i.e.,
6658 the original * applied to a group), save the information
6659 for that group and all inner ones, so that if we fail back
6660 to this point, the group's information will be correct.
6661 For example, in \(a*\)*\1, we need the preceding group,
6662 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6664 /* We can't use `p' to check ahead because we push
6665 a failure point to `p + mcnt' after we do this. */
6668 /* We need to skip no_op's before we look for the
6669 start_memory in case this on_failure_jump is happening as
6670 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6672 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6675 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6677 /* We have a new highest active register now. This will
6678 get reset at the start_memory we are about to get to,
6679 but we will have saved all the registers relevant to
6680 this repetition op, as described above. */
6681 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6682 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6683 lowest_active_reg = *(p1 + 1);
6686 DEBUG_PRINT1 (":\n");
6687 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6691 /* A smart repeat ends with `maybe_pop_jump'.
6692 We change it to either `pop_failure_jump' or `jump'. */
6693 case maybe_pop_jump:
6694 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6695 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
6697 register US_CHAR_TYPE *p2 = p;
6699 /* Compare the beginning of the repeat with what in the
6700 pattern follows its end. If we can establish that there
6701 is nothing that they would both match, i.e., that we
6702 would have to backtrack because of (as in, e.g., `a*a')
6703 then we can change to pop_failure_jump, because we'll
6704 never have to backtrack.
6706 This is not true in the case of alternatives: in
6707 `(a|ab)*' we do need to backtrack to the `ab' alternative
6708 (e.g., if the string was `ab'). But instead of trying to
6709 detect that here, the alternative has put on a dummy
6710 failure point which is what we will end up popping. */
6712 /* Skip over open/close-group commands.
6713 If what follows this loop is a ...+ construct,
6714 look at what begins its body, since we will have to
6715 match at least one of that. */
6719 && ((re_opcode_t) *p2 == stop_memory
6720 || (re_opcode_t) *p2 == start_memory))
6722 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
6723 && (re_opcode_t) *p2 == dummy_failure_jump)
6724 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
6730 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
6731 to the `maybe_pop_jump' of this case. Examine what
6734 /* If we're at the end of the pattern, we can change. */
6737 /* Consider what happens when matching ":\(.*\)"
6738 against ":/". I don't really understand this code
6740 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
6743 (" End of pattern: change to `pop_failure_jump'.\n");
6746 else if ((re_opcode_t) *p2 == exactn
6748 || (re_opcode_t) *p2 == exactn_bin
6750 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
6752 register US_CHAR_TYPE c
6753 = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2];
6755 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
6757 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
6759 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
6761 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
6763 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
6764 c, p1[3+OFFSET_ADDRESS_SIZE]);
6768 else if ((re_opcode_t) p1[3] == charset
6769 || (re_opcode_t) p1[3] == charset_not)
6771 int not = (re_opcode_t) p1[3] == charset_not;
6773 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
6774 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6777 /* `not' is equal to 1 if c would match, which means
6778 that we can't change to pop_failure_jump. */
6781 p[-3] = (unsigned char) pop_failure_jump;
6782 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6785 #endif /* not MBS_SUPPORT */
6788 else if ((re_opcode_t) *p2 == charset)
6790 /* We win if the first character of the loop is not part
6792 if ((re_opcode_t) p1[3] == exactn
6793 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
6794 && (p2[2 + p1[5] / BYTEWIDTH]
6795 & (1 << (p1[5] % BYTEWIDTH)))))
6797 p[-3] = (unsigned char) pop_failure_jump;
6798 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6801 else if ((re_opcode_t) p1[3] == charset_not)
6804 /* We win if the charset_not inside the loop
6805 lists every character listed in the charset after. */
6806 for (idx = 0; idx < (int) p2[1]; idx++)
6807 if (! (p2[2 + idx] == 0
6808 || (idx < (int) p1[4]
6809 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
6814 p[-3] = (unsigned char) pop_failure_jump;
6815 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6818 else if ((re_opcode_t) p1[3] == charset)
6821 /* We win if the charset inside the loop
6822 has no overlap with the one after the loop. */
6824 idx < (int) p2[1] && idx < (int) p1[4];
6826 if ((p2[2 + idx] & p1[5 + idx]) != 0)
6829 if (idx == p2[1] || idx == p1[4])
6831 p[-3] = (unsigned char) pop_failure_jump;
6832 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6836 #endif /* not MBS_SUPPORT */
6838 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
6839 if ((re_opcode_t) p[-1] != pop_failure_jump)
6841 p[-1] = (US_CHAR_TYPE) jump;
6842 DEBUG_PRINT1 (" Match => jump.\n");
6843 goto unconditional_jump;
6845 /* Note fall through. */
6848 /* The end of a simple repeat has a pop_failure_jump back to
6849 its matching on_failure_jump, where the latter will push a
6850 failure point. The pop_failure_jump takes off failure
6851 points put on by this pop_failure_jump's matching
6852 on_failure_jump; we got through the pattern to here from the
6853 matching on_failure_jump, so didn't fail. */
6854 case pop_failure_jump:
6856 /* We need to pass separate storage for the lowest and
6857 highest registers, even though we don't care about the
6858 actual values. Otherwise, we will restore only one
6859 register from the stack, since lowest will == highest in
6860 `pop_failure_point'. */
6861 active_reg_t dummy_low_reg, dummy_high_reg;
6862 US_CHAR_TYPE *pdummy = NULL;
6863 const CHAR_TYPE *sdummy = NULL;
6865 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
6866 POP_FAILURE_POINT (sdummy, pdummy,
6867 dummy_low_reg, dummy_high_reg,
6868 reg_dummy, reg_dummy, reg_info_dummy);
6870 /* Note fall through. */
6874 DEBUG_PRINT2 ("\n%p: ", p);
6876 DEBUG_PRINT2 ("\n0x%x: ", p);
6878 /* Note fall through. */
6880 /* Unconditionally jump (without popping any failure points). */
6882 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
6883 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
6884 p += mcnt; /* Do the jump. */
6886 DEBUG_PRINT2 ("(to %p).\n", p);
6888 DEBUG_PRINT2 ("(to 0x%x).\n", p);
6893 /* We need this opcode so we can detect where alternatives end
6894 in `group_match_null_string_p' et al. */
6896 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
6897 goto unconditional_jump;
6900 /* Normally, the on_failure_jump pushes a failure point, which
6901 then gets popped at pop_failure_jump. We will end up at
6902 pop_failure_jump, also, and with a pattern of, say, `a+', we
6903 are skipping over the on_failure_jump, so we have to push
6904 something meaningless for pop_failure_jump to pop. */
6905 case dummy_failure_jump:
6906 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
6907 /* It doesn't matter what we push for the string here. What
6908 the code at `fail' tests is the value for the pattern. */
6909 PUSH_FAILURE_POINT (NULL, NULL, -2);
6910 goto unconditional_jump;
6913 /* At the end of an alternative, we need to push a dummy failure
6914 point in case we are followed by a `pop_failure_jump', because
6915 we don't want the failure point for the alternative to be
6916 popped. For example, matching `(a|ab)*' against `aab'
6917 requires that we match the `ab' alternative. */
6918 case push_dummy_failure:
6919 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
6920 /* See comments just above at `dummy_failure_jump' about the
6922 PUSH_FAILURE_POINT (NULL, NULL, -2);
6925 /* Have to succeed matching what follows at least n times.
6926 After that, handle like `on_failure_jump'. */
6928 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
6929 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
6932 /* Originally, this is how many times we HAVE to succeed. */
6936 p += OFFSET_ADDRESS_SIZE;
6937 STORE_NUMBER_AND_INCR (p, mcnt);
6939 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
6942 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
6949 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
6950 p + OFFSET_ADDRESS_SIZE);
6952 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
6953 p + OFFSET_ADDRESS_SIZE);
6957 p[1] = (US_CHAR_TYPE) no_op;
6959 p[2] = (US_CHAR_TYPE) no_op;
6960 p[3] = (US_CHAR_TYPE) no_op;
6961 #endif /* MBS_SUPPORT */
6967 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
6968 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
6970 /* Originally, this is how many times we CAN jump. */
6974 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
6977 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
6980 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
6983 goto unconditional_jump;
6985 /* If we don't have to jump any more, skip over the rest of the command. */
6987 p += 2 * OFFSET_ADDRESS_SIZE;
6992 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
6994 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6996 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6998 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7000 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7002 STORE_NUMBER (p1, mcnt);
7007 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7008 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7009 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7010 macro and introducing temporary variables works around the bug. */
7013 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7014 if (AT_WORD_BOUNDARY (d))
7019 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7020 if (AT_WORD_BOUNDARY (d))
7026 boolean prevchar, thischar;
7028 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7029 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7032 prevchar = WORDCHAR_P (d - 1);
7033 thischar = WORDCHAR_P (d);
7034 if (prevchar != thischar)
7041 boolean prevchar, thischar;
7043 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7044 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7047 prevchar = WORDCHAR_P (d - 1);
7048 thischar = WORDCHAR_P (d);
7049 if (prevchar != thischar)
7056 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7057 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7062 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7063 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7064 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
7070 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7071 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7076 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7077 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7082 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7083 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7088 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7093 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7097 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7099 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7101 SET_REGS_MATCHED ();
7105 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7107 goto matchnotsyntax;
7110 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7114 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7116 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7118 SET_REGS_MATCHED ();
7121 #else /* not emacs */
7123 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7125 if (!WORDCHAR_P (d))
7127 SET_REGS_MATCHED ();
7132 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7136 SET_REGS_MATCHED ();
7139 #endif /* not emacs */
7144 continue; /* Successfully executed one pattern command; keep going. */
7147 /* We goto here if a matching operation fails. */
7149 if (!FAIL_STACK_EMPTY ())
7150 { /* A restart point is known. Restore to that state. */
7151 DEBUG_PRINT1 ("\nFAIL:\n");
7152 POP_FAILURE_POINT (d, p,
7153 lowest_active_reg, highest_active_reg,
7154 regstart, regend, reg_info);
7156 /* If this failure point is a dummy, try the next one. */
7160 /* If we failed to the end of the pattern, don't examine *p. */
7164 boolean is_a_jump_n = false;
7166 /* If failed to a backwards jump that's part of a repetition
7167 loop, need to pop this failure point and use the next one. */
7168 switch ((re_opcode_t) *p)
7172 case maybe_pop_jump:
7173 case pop_failure_jump:
7176 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7179 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7181 && (re_opcode_t) *p1 == on_failure_jump))
7189 if (d >= string1 && d <= end1)
7193 break; /* Matching at this starting point really fails. */
7197 goto restore_best_regs;
7201 return -1; /* Failure to match. */
7204 /* Subroutine definitions for re_match_2. */
7207 /* Report whether a group can match the empty string.

   We are passed P pointing to a register number after a start_memory;
   scanning starts two elements further on, past the start_memory
   arguments.  REG_INFO is handed on unchanged to
   alt_match_null_string_p and common_op_match_null_string_p.

7209 Return true if the pattern up to the corresponding stop_memory can
7210 match the empty string, and false otherwise.
7212 If we find the matching stop_memory, sets P to point to one past its number.
7213 Otherwise, sets P to an undefined byte less than or equal to END.

   A series of alternatives is checked one alternative at a time with
   alt_match_null_string_p; an opcode that has no case of its own here is
   delegated to common_op_match_null_string_p.

7215 We don't handle duplicates properly (yet). */
7218 group_match_null_string_p (p, end, reg_info)
7219 US_CHAR_TYPE **p, *end;
7220 register_info_type *reg_info;
7223 /* Point to after the args to the start_memory. */
7224 US_CHAR_TYPE *p1 = *p + 2;
7228 /* Skip over opcodes that can match nothing, and return true or
7229 false, as appropriate, when we get to one that can't, or to the
7230 matching stop_memory. */
7232 switch ((re_opcode_t) *p1)
7234 /* Could be either a loop or a series of alternatives. */
7235 case on_failure_jump:
7237 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7239 /* If the next operation is not a jump backwards in the
7244 /* Go through the on_failure_jumps of the alternatives,
7245 seeing if any of the alternatives cannot match nothing.
7246 The last alternative starts with only a jump,
7247 whereas the rest start with on_failure_jump and end
7248 with a jump, e.g., here is the pattern for `a|b|c':
7250 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7251 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7254 So, we have to first go through the first (n-1)
7255 alternatives and then deal with the last one separately. */
7258 /* Deal with the first (n-1) alternatives, which start
7259 with an on_failure_jump (see above) that jumps to right
7260 past a jump_past_alt. */
7262 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7265 /* `mcnt' holds how many bytes long the alternative
7266 is, including the ending `jump_past_alt' and
7269 if (!alt_match_null_string_p (p1, p1 + mcnt -
7270 (1 + OFFSET_ADDRESS_SIZE),
7274 /* Move to right after this alternative, including the
7278 /* Break if it's the beginning of an n-th alternative
7279 that doesn't begin with an on_failure_jump. */
7280 if ((re_opcode_t) *p1 != on_failure_jump)
7283 /* Still have to check that it's not an n-th
7284 alternative that starts with an on_failure_jump. */
7286 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7287 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7290 /* Get to the beginning of the n-th alternative. */
7291 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7296 /* Deal with the last alternative: go back and get number
7297 of the `jump_past_alt' just before it. `mcnt' contains
7298 the length of the alternative. */
7299 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7301 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
7304 p1 += mcnt; /* Get past the n-th alternative. */
/* Sanity check: the element following p1 must equal the register number
   that *P designates.  NOTE(review): presumably reached at the matching
   stop_memory; the case label is not visible here -- confirm.  */
7310 assert (p1[1] == **p);
7316 if (!common_op_match_null_string_p (&p1, end, reg_info))
7319 } /* while p1 < end */
7322 } /* group_match_null_string_p */
7325 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7326 It expects P to be the first byte of a single alternative and END one
7327 byte past the last. The alternative can contain groups.

   P is taken by value, so the caller's pointer is not advanced.
   on_failure_jump has a case of its own here; every other opcode is
   passed, together with END and REG_INFO, to
   common_op_match_null_string_p.
   NOTE(review): the return statements are not visible in this excerpt;
   by analogy with group_match_null_string_p the result is presumably
   true when the alternative can match the empty string -- confirm. */
7330 alt_match_null_string_p (p, end, reg_info)
7331 US_CHAR_TYPE *p, *end;
7332 register_info_type *reg_info;
7335 US_CHAR_TYPE *p1 = p;
7339 /* Skip over opcodes that can match nothing, and break when we get
7340 to one that can't. */
7342 switch ((re_opcode_t) *p1)
7345 case on_failure_jump:
7347 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7352 if (!common_op_match_null_string_p (&p1, end, reg_info))
7355 } /* while p1 < end */
7358 } /* alt_match_null_string_p */
7361 /* Deals with the ops common to group_match_null_string_p and
7362 alt_match_null_string_p.
7364 Sets P to one after the op and its arguments, if any.

   The opcode found at *P selects the handling.  A nested group is
   examined by calling group_match_null_string_p, and that answer is
   recorded in REG_INFO for the group's register only when no answer had
   been recorded before.  Opcodes without a case of their own mean the
   empty string cannot be matched.  */
7367 common_op_match_null_string_p (p, end, reg_info)
7368 US_CHAR_TYPE **p, *end;
7369 register_info_type *reg_info;
7374 US_CHAR_TYPE *p1 = *p;
7376 switch ((re_opcode_t) *p1++)
7396 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7397 ret = group_match_null_string_p (&p1, end, reg_info);
7399 /* Have to set this here in case we're checking a group which
7400 contains a group and a back reference to it. */
7402 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7403 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7409 /* If this is an optimized succeed_n for zero times, make the jump. */
7411 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7419 /* Get to the number of times to succeed. */
7420 p1 += OFFSET_ADDRESS_SIZE;
7421 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
/* Back up to the first number, which was skipped above, and read it
   (assuming EXTRACT_NUMBER_AND_INCR advances by OFFSET_ADDRESS_SIZE --
   confirm against the macro).  */
7425 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7426 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7434 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7439 p1 += 2 * OFFSET_ADDRESS_SIZE;
7442 /* All other opcodes mean we cannot match the empty string. */
7448 } /* common_op_match_null_string_p */
7451 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7452 bytes; nonzero otherwise.

   Both strings are walked through unsigned element pointers.  Two
   comparison forms appear below: one looks up only elements that are
   <= 0xff in TRANSLATE and compares larger elements untranslated; the
   other always goes through TRANSLATE.  Judging by the `#endif' comment
   the choice is governed by MBS_SUPPORT, the first form presumably being
   the MBS_SUPPORT one -- confirm against the preprocessor lines.  */
7455 bcmp_translate (s1, s2, len, translate)
7456 const CHAR_TYPE *s1, *s2;
7458 RE_TRANSLATE_TYPE translate;
7460 register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1;
7461 register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2;
7465 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7466 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7469 if (translate[*p1++] != translate[*p2++]) return 1;
7470 #endif /* MBS_SUPPORT */
7476 /* Entry points for GNU code. */
7478 /* re_compile_pattern is the GNU regular expression compiler: it
7479 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
7480 Returns 0 if the pattern was valid, otherwise an error string.
   The error string is the re_error_msgid entry selected by
   regex_compile's result, passed through gettext.
7482 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7483 are set in BUFP on entry.
   Before compiling, BUFP->regs_allocated is set to REGS_UNALLOCATED and
   BUFP->newline_anchor to 1; re_syntax_options is used as the syntax.
7485 We call regex_compile to do the actual compilation. */
7488 re_compile_pattern (pattern, length, bufp)
7489 const char *pattern;
7491 struct re_pattern_buffer *bufp;
7495 /* GNU code is written to assume at least RE_NREGS registers will be set
7496 (and at least one extra will be -1). */
7497 bufp->regs_allocated = REGS_UNALLOCATED;
7499 /* And GNU code determines whether or not to get register information
7500 by passing null for the REGS argument to re_match, etc., not by
7504 /* Match anchors at newline. */
7505 bufp->newline_anchor = 1;
7507 ret = regex_compile (pattern, length, re_syntax_options, bufp);
7511 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7514 weak_alias (__re_compile_pattern, re_compile_pattern)
7517 /* Entry points compatible with 4.2 BSD regex library. We don't define
7518 them unless specifically requested. */
7520 #if defined _REGEX_RE_COMP || defined _LIBC
7522 /* BSD has one and only one pattern buffer. */
7523 static struct re_pattern_buffer re_comp_buf;
7527 /* Make these definitions weak in libc, so POSIX programs can redefine
7528 these names if they don't use our functions, and still use
7529 regcomp/regexec below without link errors. */
/* NOTE(review): the declarator of this function is not visible in this
   excerpt; the code below is presumably the body of the BSD `re_comp'
   entry point taking the pattern string S -- confirm.

   With re_comp_buf.buffer still unset, the first visible test returns the
   "No previous regular expression" message (the condition guarding that
   test is not shown).  Otherwise, on first use, the single static buffer
   is given a 200-byte pattern area and a (1 << BYTEWIDTH)-byte fastmap;
   newline anchoring is switched on and S is compiled by regex_compile
   with re_syntax_options.  A failed allocation returns the REG_ESPACE
   message; the final return yields the message selected by
   regex_compile's result.  */
7539 if (!re_comp_buf.buffer)
7540 return gettext ("No previous regular expression");
7544 if (!re_comp_buf.buffer)
7546 re_comp_buf.buffer = (unsigned char *) malloc (200);
7547 if (re_comp_buf.buffer == NULL)
7548 return (char *) gettext (re_error_msgid
7549 + re_error_msgid_idx[(int) REG_ESPACE]);
7550 re_comp_buf.allocated = 200;
7552 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7553 if (re_comp_buf.fastmap == NULL)
7554 return (char *) gettext (re_error_msgid
7555 + re_error_msgid_idx[(int) REG_ESPACE]);
7558 /* Since `re_exec' always passes NULL for the `regs' argument, we
7559 don't need to initialize the pattern buffer fields which affect it. */
7561 /* Match anchors at newlines. */
7562 re_comp_buf.newline_anchor = 1;
7564 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7569 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7570 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
/* NOTE(review): the declarator is not visible in this excerpt; presumably
   this is the body of the BSD `re_exec' entry point -- confirm.
   Searches all of S (start offset 0, range strlen (S)) with the pattern
   held in re_comp_buf, requesting no register information.  The
   comparison below is nonzero exactly when re_search's result is >= 0.  */
7581 const int len = strlen (s);
7583 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7586 #endif /* _REGEX_RE_COMP */
7588 /* POSIX.2 functions. Don't define these for Emacs. */
7592 /* regcomp takes a regular expression as a string and compiles it.
7594 PREG is a regex_t *. We do not expect any fields to be initialized,
7595 since POSIX says we shouldn't. Thus, we set
7597 `buffer' to the compiled pattern;
7598 `used' to the length of the compiled pattern;
7599 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7600 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7601 RE_SYNTAX_POSIX_BASIC;
7602 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7603 `fastmap' to an allocated space for the fastmap;
7604 `fastmap_accurate' to zero;
7605 `re_nsub' to the number of subexpressions in PATTERN.
7607 PATTERN is the address of the pattern string.
7609 CFLAGS is a series of bits which affect compilation.
7611 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7612 use POSIX basic syntax.
7614 If REG_NEWLINE is set, then . and [^...] don't match newline.
7615 Also, regexec will try a match beginning after every newline.
7617 If REG_ICASE is set, then we consider upper- and lowercase
7618 versions of letters to be equivalent when matching.
7620 If REG_NOSUB is set, then when PREG is passed to regexec, that
7621 routine will report only success or failure, and nothing about the
7624 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7625 the return codes and their meanings.)

   If REG_ICASE is set and the translate table cannot be allocated,
   REG_ESPACE is returned; the fastmap obtained just before is released
   first so that the failed call does not leak it.  A fastmap that
   cannot be allocated, or cannot be computed, is not an error: PREG is
   simply left without one. */
7628 regcomp (preg, pattern, cflags)
7630 const char *pattern;
7635 = (cflags & REG_EXTENDED) ?
7636 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7638 /* regex_compile will allocate the space for the compiled pattern. */
7640 preg->allocated = 0;
7643 /* Try to allocate space for the fastmap. */
7644 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7646 if (cflags & REG_ICASE)
7651 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7652 * sizeof (*(RE_TRANSLATE_TYPE)0));
7653 if (preg->translate == NULL)
{
/* Do not leak the fastmap allocated above; callers are not expected
   to call regfree after a failed regcomp.  free (NULL) is harmless
   if that allocation failed too.  */
free (preg->fastmap);
preg->fastmap = NULL;
7654 return (int) REG_ESPACE;
}
7656 /* Map uppercase characters to corresponding lowercase ones. */
7657 for (i = 0; i < CHAR_SET_SIZE; i++)
7658 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7661 preg->translate = NULL;
7663 /* If REG_NEWLINE is set, newlines are treated differently. */
7664 if (cflags & REG_NEWLINE)
7665 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7666 syntax &= ~RE_DOT_NEWLINE;
7667 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7668 /* It also changes the matching behavior. */
7669 preg->newline_anchor = 1;
7672 preg->newline_anchor = 0;
7674 preg->no_sub = !!(cflags & REG_NOSUB);
7676 /* POSIX says a null character in the pattern terminates it, so we
7677 can use strlen here in compiling the pattern. */
7678 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
7680 /* POSIX doesn't distinguish between an unmatched open-group and an
7681 unmatched close-group: both are REG_EPAREN. */
7682 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7684 if (ret == REG_NOERROR && preg->fastmap)
7686 /* Compute the fastmap now, since regexec cannot modify the pattern
7688 if (re_compile_fastmap (preg) == -2)
7690 /* Some error occurred while computing the fastmap, just forget
7692 free (preg->fastmap);
7693 preg->fastmap = NULL;
7700 weak_alias (__regcomp, regcomp)
7704 /* regexec searches for a given pattern, specified by PREG, in the
   string STRING.
7707 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7708 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7709 least NMATCH elements, and we set them to the offsets of the
7710 corresponding matched substrings.
7712 EFLAGS specifies `execution flags' which affect matching: if
7713 REG_NOTBOL is set, then ^ does not match at the beginning of the
7714 string; if REG_NOTEOL is set, then $ does not match at the end.
7716 We return 0 if we find a match and REG_NOMATCH if not.

   PREG itself is not modified: the search runs on a private copy whose
   not_bol, not_eol and regs_allocated fields are adjusted.  Failure to
   allocate the temporary register arrays is also reported as
   REG_NOMATCH. */
7719 regexec (preg, string, nmatch, pmatch, eflags)
7720 const regex_t *preg;
7723 regmatch_t pmatch[];
7727 struct re_registers regs;
7728 regex_t private_preg;
7729 int len = strlen (string);
7730 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7732 private_preg = *preg;
7734 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7735 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7737 /* The user has told us exactly how many registers to return
7738 information about, via `nmatch'. We have to pass that on to the
7739 matching routines. */
7740 private_preg.regs_allocated = REGS_FIXED;
7744 regs.num_regs = nmatch;
/* One allocation holds both arrays: NMATCH start offsets followed by
   NMATCH end offsets.  */
7745 regs.start = TALLOC (nmatch * 2, regoff_t);
7746 if (regs.start == NULL)
7747 return (int) REG_NOMATCH;
7748 regs.end = regs.start + nmatch;
7751 /* Perform the searching operation. */
7752 ret = re_search (&private_preg, string, len,
7753 /* start: */ 0, /* range: */ len,
7754 want_reg_info ? &regs : (struct re_registers *) 0);
7756 /* Copy the register information to the POSIX structure. */
7763 for (r = 0; r < nmatch; r++)
7765 pmatch[r].rm_so = regs.start[r];
7766 pmatch[r].rm_eo = regs.end[r];
7770 /* If we needed the temporary register info, free the space now. */
7774 /* We want zero return to mean success, unlike `re_search'. */
7775 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7778 weak_alias (__regexec, regexec)
7782 /* Returns a message corresponding to an error code, ERRCODE, returned
7783 from either regcomp or regexec. We don't use PREG here.

   ERRCODE indexes re_error_msgid_idx; a value at or beyond the number of
   entries in that table is treated as a program bug (see below).  When
   ERRBUF_SIZE is nonzero the message is copied into ERRBUF: in full,
   terminating null included, if it fits; otherwise only its first
   ERRBUF_SIZE - 1 bytes followed by a terminating null.
   NOTE(review): the return statement is not visible in this excerpt;
   presumably the full message size (msg_size) is returned -- confirm.  */
7786 regerror (errcode, preg, errbuf, errbuf_size)
7788 const regex_t *preg;
7796 || errcode >= (int) (sizeof (re_error_msgid_idx)
7797 / sizeof (re_error_msgid_idx[0])))
7798 /* Only error codes returned by the rest of the code should be passed
7799 to this routine. If we are given anything else, or if other regex
7800 code generates an invalid error code, then the program has a bug.
7801 Dump core so we can fix it. */
7804 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
7806 msg_size = strlen (msg) + 1; /* Includes the null. */
7808 if (errbuf_size != 0)
7810 if (msg_size > errbuf_size)
7812 #if defined HAVE_MEMPCPY || defined _LIBC
7813 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
7815 memcpy (errbuf, msg, errbuf_size - 1);
7816 errbuf[errbuf_size - 1] = 0;
7820 memcpy (errbuf, msg, msg_size);
7826 weak_alias (__regerror, regerror)
7830 /* Free dynamically allocated space used by PREG.

   Releases PREG->buffer, PREG->fastmap and PREG->translate when they are
   non-null and resets each pointer to NULL afterwards; `allocated' and
   `fastmap_accurate' are cleared as well.
   NOTE(review): the declarator is not visible in this excerpt; the name
   regfree is taken from the weak_alias line below -- confirm.  */
7836 if (preg->buffer != NULL)
7837 free (preg->buffer);
7838 preg->buffer = NULL;
7840 preg->allocated = 0;
7843 if (preg->fastmap != NULL)
7844 free (preg->fastmap);
7845 preg->fastmap = NULL;
7846 preg->fastmap_accurate = 0;
7848 if (preg->translate != NULL)
7849 free (preg->translate);
7850 preg->translate = NULL;
7853 weak_alias (__regfree, regfree)
7856 #endif /* not emacs */