1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
49 #include "pcre_internal.h"
52 /* Returns from set_start_bits() */
54 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
57 /*************************************************
58 * Set a bit and maybe its alternate case *
59 *************************************************/
61 /* Given a character, set its bit in the table, and also the bit for the other
62 version of a letter if we are caseless.
65 start_bits points to the bit map
67 caseless the caseless flag
68 cd the block with char table pointers
74 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
76 start_bits[c/8] |= (1 << (c&7));
77 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
83 /*************************************************
84 * Create bitmap of starting bytes *
85 *************************************************/
87 /* This function scans a compiled unanchored expression recursively and
88 attempts to build a bitmap of the set of possible starting bytes. As time goes
89 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90 useful for parenthesized groups in patterns such as (a*)b where the group
91 provides some optional starting bytes but scanning must continue at the outer
92 level to find at least one mandatory byte. At the outermost level, this
93 function fails unless the result is SSB_DONE.
96 code points to an expression
97 start_bits points to a 32-byte table, initialized to 0
98 caseless the current state of the caseless flag
99 utf8 TRUE if in UTF-8 mode
100 cd the block with char table pointers
102 Returns: SSB_FAIL => Failed to find any starting bytes
103 SSB_DONE => Found mandatory starting bytes
104 SSB_CONTINUE => Found optional starting bytes
108 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109 BOOL utf8, compile_data *cd)
112 int yield = SSB_DONE;
115 /* ========================================================================= */
116 /* The following comment and code was inserted in January 1999. In May 2006,
117 when it was observed to cause compiler warnings about unused values, I took it
118 out again. If anybody is still using OS/2, they will have to put it back
121 /* This next statement and the later reference to dummy are here in order to
122 trick the optimizer of the IBM C compiler for OS/2 into generating correct
123 code. Apparently IBM isn't going to fix the problem, and we would rather not
124 disable optimization (in this module it actually makes a big difference, and
125 the pcre module can use all the optimization it can get). */
128 /* ========================================================================= */
133 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134 BOOL try_next = TRUE;
136 while (try_next) /* Loop for items in this branch */
141 /* Fail if we reach something we don't understand */
146 /* If we hit a bracket or a positive lookahead assertion, recurse to set
147 bits from within the subpattern. If it can't find anything, we have to
148 give up. If it finds some mandatory character(s), we are done for this
149 branch. Otherwise, carry on scanning after the subpattern. */
157 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158 if (rc == SSB_FAIL) return SSB_FAIL;
159 if (rc == SSB_DONE) try_next = FALSE; else
161 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162 tcode += 1 + LINK_SIZE;
166 /* If we hit ALT or KET, it means we haven't found anything mandatory in
167 this branch, though we might have found something optional. For ALT, we
168 continue with the next alternative, but we have to arrange that the final
169 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170 return SSB_CONTINUE: if this is the top level, that indicates failure,
171 but after a nested subpattern, it causes scanning to continue. */
174 yield = SSB_CONTINUE;
183 /* Skip over callout */
186 tcode += 2 + 2*LINK_SIZE;
189 /* Skip over lookbehind and negative lookahead assertions */
193 case OP_ASSERTBACK_NOT:
194 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195 tcode += 1 + LINK_SIZE;
198 /* Skip over an option setting, changing the caseless flag */
201 caseless = (tcode[1] & PCRE_CASELESS) != 0;
205 /* BRAZERO does the bracket, but carries on. */
209 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
211 /* =========================================================================
212 See the comment at the head of this function concerning the next line,
213 which was an old fudge for the benefit of OS/2.
215 ========================================================================= */
216 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217 tcode += 1 + LINK_SIZE;
220 /* SKIPZERO skips the bracket. */
224 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
225 tcode += 1 + LINK_SIZE;
228 /* Single-char * or ? sets the bit and tries the next item */
236 set_bit(start_bits, tcode[1], caseless, cd);
239 if (utf8 && tcode[-1] >= 0xc0)
240 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
244 /* Single-char upto sets the bit and tries the next */
249 set_bit(start_bits, tcode[3], caseless, cd);
252 if (utf8 && tcode[-1] >= 0xc0)
253 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
257 /* At least one single char sets the bit and stops */
259 case OP_EXACT: /* Fall through */
267 set_bit(start_bits, tcode[1], caseless, cd);
271 /* Single character type sets the bits and stops */
274 for (c = 0; c < 32; c++)
275 start_bits[c] |= ~cd->cbits[c+cbit_digit];
280 for (c = 0; c < 32; c++)
281 start_bits[c] |= cd->cbits[c+cbit_digit];
285 /* The cbit_space table has vertical tab as whitespace; we have to
288 case OP_NOT_WHITESPACE:
289 for (c = 0; c < 32; c++)
291 int d = cd->cbits[c+cbit_space];
292 if (c == 1) d &= ~0x08;
298 /* The cbit_space table has vertical tab as whitespace; we have to
302 for (c = 0; c < 32; c++)
304 int d = cd->cbits[c+cbit_space];
305 if (c == 1) d &= ~0x08;
311 case OP_NOT_WORDCHAR:
312 for (c = 0; c < 32; c++)
313 start_bits[c] |= ~cd->cbits[c+cbit_word];
318 for (c = 0; c < 32; c++)
319 start_bits[c] |= cd->cbits[c+cbit_word];
323 /* One or more character type fudges the pointer and restarts, knowing
324 it will hit a single character type and stop there. */
335 /* Zero or more repeats of character types set the bits and then
341 tcode += 2; /* Fall through */
347 case OP_TYPEMINQUERY:
348 case OP_TYPEPOSQUERY:
356 for (c = 0; c < 32; c++)
357 start_bits[c] |= ~cd->cbits[c+cbit_digit];
361 for (c = 0; c < 32; c++)
362 start_bits[c] |= cd->cbits[c+cbit_digit];
365 /* The cbit_space table has vertical tab as whitespace; we have to
368 case OP_NOT_WHITESPACE:
369 for (c = 0; c < 32; c++)
371 int d = cd->cbits[c+cbit_space];
372 if (c == 1) d &= ~0x08;
377 /* The cbit_space table has vertical tab as whitespace; we have to
381 for (c = 0; c < 32; c++)
383 int d = cd->cbits[c+cbit_space];
384 if (c == 1) d &= ~0x08;
389 case OP_NOT_WORDCHAR:
390 for (c = 0; c < 32; c++)
391 start_bits[c] |= ~cd->cbits[c+cbit_word];
395 for (c = 0; c < 32; c++)
396 start_bits[c] |= cd->cbits[c+cbit_word];
403 /* Character class where all the information is in a bit map: set the
404 bits and either carry on or not, according to the repeat count. If it was
405 a negative class, and we are operating with UTF-8 characters, any byte
406 with a value >= 0xc4 is a potentially valid starter because it starts a
407 character with a value > 255. */
413 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
414 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
423 /* In UTF-8 mode, the bits in a bit map correspond to character
424 values, not to byte values. However, the bit map we are constructing is
425 for byte values. So we have to do a conversion for characters whose
426 value is > 127. In fact, there are only two possible starting bytes for
427 characters in the range 128 - 255. */
432 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
433 for (c = 128; c < 256; c++)
435 if ((tcode[c/8] && (1 << (c&7))) != 0)
437 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
438 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
439 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
444 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
449 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
452 /* Advance past the bit map, and act on what follows */
466 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
467 else try_next = FALSE;
475 break; /* End of bitmap class handling */
477 } /* End of switch */
478 } /* End of try_next loop */
480 code += GET(code, 1); /* Advance to next branch */
482 while (*code == OP_ALT);
488 /*************************************************
489 * Study a compiled expression *
490 *************************************************/
492 /* This function is handed a compiled expression that it must study to produce
493 information that will speed up the matching. It returns a pcre_extra block
494 which then gets handed back to pcre_exec().
497 re points to the compiled expression
498 options contains option bits
499 errorptr points to where to place error messages;
500 set NULL unless error
502 Returns: pointer to a pcre_extra block, with study_data filled in and the
503 appropriate flag set;
504 NULL on error or if no optimization possible
507 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
508 pcre_study(const pcre *external_re, int options, const char **errorptr)
510 uschar start_bits[32];
512 pcre_study_data *study;
513 const uschar *tables;
515 compile_data compile_block;
516 const real_pcre *re = (const real_pcre *)external_re;
520 if (re == NULL || re->magic_number != MAGIC_NUMBER)
522 *errorptr = "argument is not a compiled regular expression";
526 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
528 *errorptr = "unknown or incorrect option bit(s) set";
532 code = (uschar *)re + re->name_table_offset +
533 (re->name_count * re->name_entry_size);
535 /* For an anchored pattern, or an unanchored pattern that has a first char, or
536 a multiline pattern that matches only at "line starts", no further processing
539 if ((re->options & PCRE_ANCHORED) != 0 ||
540 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
543 /* Set the character tables in the block that is passed around */
547 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
550 compile_block.lcc = tables + lcc_offset;
551 compile_block.fcc = tables + fcc_offset;
552 compile_block.cbits = tables + cbits_offset;
553 compile_block.ctypes = tables + ctypes_offset;
555 /* See if we can find a fixed set of initial characters for the pattern. */
557 memset(start_bits, 0, 32 * sizeof(uschar));
558 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
559 (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
561 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
562 the latter, which is pointed to by the former, which may also get additional
563 data set later by the calling program. At the moment, the size of
564 pcre_study_data is fixed. We nevertheless save it in a field for returning via
565 the pcre_fullinfo() function so that if it becomes variable in the future, we
566 don't have to change that code. */
568 extra = (pcre_extra *)(pcre_malloc)
569 (sizeof(pcre_extra) + sizeof(pcre_study_data));
573 *errorptr = "failed to get memory";
577 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
578 extra->flags = PCRE_EXTRA_STUDY_DATA;
579 extra->study_data = study;
581 study->size = sizeof(pcre_study_data);
582 study->options = PCRE_STUDY_MAPPED;
583 memcpy(study->start_bits, start_bits, sizeof(start_bits));
588 /* End of pcre_study.c */