pcre_study.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2010 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_study(), along with local
  42 supporting functions. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #include "pcre_internal.h"
  50
  51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
  52
  53 /* Returns from set_start_bits() */
  54
  55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
  56
  57
  58
  59 /*************************************************
  60 *   Find the minimum subject length for a group  *
  61 *************************************************/
  62
  63 /* Scan a parenthesized group and compute the minimum length of subject that
  64 is needed to match it. This is a lower bound; it does not mean there is a
  65 string of that length that matches. In UTF8 mode, the result is in characters
  66 rather than bytes.
  67
  68 Arguments:
  69   code       pointer to start of group (the bracket)
  70   startcode  pointer to start of the whole pattern
  71   options    the compiling options
  72
  73 Returns:   the minimum length
  74            -1 if \C was encountered
  75            -2 internal error (missing capturing bracket)
  76 */
  77
  78 static int
  79 find_minlength(const uschar *code, const uschar *startcode, int options)
  80 {
  81 int length = -1;
  82 BOOL utf8 = (options & PCRE_UTF8) != 0;
  83 BOOL had_recurse = FALSE;
  84 register int branchlength = 0;
  85 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
  86
  87 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
  88
  89 /* Scan along the opcodes for this branch. If we get to the end of the
  90 branch, check the length against that of the other branches. */
  91
  92 for (;;)
  93   {
  94   int d, min;
  95   uschar *cs, *ce;
  96   register int op = *cc;
  97
  98   switch (op)
  99     {
 100     case OP_COND:
 101     case OP_SCOND:
 102
 103     /* If there is only one branch in a condition, the implied branch has zero
 104     length, so we don't add anything. This covers the DEFINE "condition"
 105     automatically. */
 106
 107     cs = cc + GET(cc, 1);
 108     if (*cs != OP_ALT)
 109       {
 110       cc = cs + 1 + LINK_SIZE;
 111       break;
 112       }
 113
 114     /* Otherwise we can fall through and treat it the same as any other
 115     subpattern. */
 116
 117     case OP_CBRA:
 118     case OP_SCBRA:
 119     case OP_BRA:
 120     case OP_SBRA:
 121     case OP_ONCE:
 122     d = find_minlength(cc, startcode, options);
 123     if (d < 0) return d;
 124     branchlength += d;
 125     do cc += GET(cc, 1); while (*cc == OP_ALT);
 126     cc += 1 + LINK_SIZE;
 127     break;
 128
 129     /* Reached end of a branch; if it's a ket it is the end of a nested
 130     call. If it's ALT it is an alternation in a nested call. If it is
 131     END it's the end of the outer call. All can be handled by the same code. */
 132
 133     case OP_ALT:
 134     case OP_KET:
 135     case OP_KETRMAX:
 136     case OP_KETRMIN:
 137     case OP_END:
 138     if (length < 0 || (!had_recurse && branchlength < length))
 139       length = branchlength;
 140     if (*cc != OP_ALT) return length;
 141     cc += 1 + LINK_SIZE;
 142     branchlength = 0;
 143     had_recurse = FALSE;
 144     break;
 145
 146     /* Skip over assertive subpatterns */
 147
 148     case OP_ASSERT:
 149     case OP_ASSERT_NOT:
 150     case OP_ASSERTBACK:
 151     case OP_ASSERTBACK_NOT:
 152     do cc += GET(cc, 1); while (*cc == OP_ALT);
 153     /* Fall through */
 154
 155     /* Skip over things that don't match chars */
 156
 157     case OP_REVERSE:
 158     case OP_CREF:
 159     case OP_NCREF:
 160     case OP_RREF:
 161     case OP_NRREF:
 162     case OP_DEF:
 163     case OP_OPT:
 164     case OP_CALLOUT:
 165     case OP_SOD:
 166     case OP_SOM:
 167     case OP_EOD:
 168     case OP_EODN:
 169     case OP_CIRC:
 170     case OP_DOLL:
 171     case OP_NOT_WORD_BOUNDARY:
 172     case OP_WORD_BOUNDARY:
 173     cc += _pcre_OP_lengths[*cc];
 174     break;
 175
 176     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
 177
 178     case OP_BRAZERO:
 179     case OP_BRAMINZERO:
 180     case OP_SKIPZERO:
 181     cc += _pcre_OP_lengths[*cc];
 182     do cc += GET(cc, 1); while (*cc == OP_ALT);
 183     cc += 1 + LINK_SIZE;
 184     break;
 185
 186     /* Handle literal characters and + repetitions */
 187
 188     case OP_CHAR:
 189     case OP_CHARNC:
 190     case OP_NOT:
 191     case OP_PLUS:
 192     case OP_MINPLUS:
 193     case OP_POSPLUS:
 194     case OP_NOTPLUS:
 195     case OP_NOTMINPLUS:
 196     case OP_NOTPOSPLUS:
 197     branchlength++;
 198     cc += 2;
 199 #ifdef SUPPORT_UTF8
 200     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
 201 #endif
 202     break;
 203
 204     case OP_TYPEPLUS:
 205     case OP_TYPEMINPLUS:
 206     case OP_TYPEPOSPLUS:
 207     branchlength++;
 208     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
 209     break;
 210
 211     /* Handle exact repetitions. The count is already in characters, but we
 212     need to skip over a multibyte character in UTF8 mode.  */
 213
 214     case OP_EXACT:
 215     case OP_NOTEXACT:
 216     branchlength += GET2(cc,1);
 217     cc += 4;
 218 #ifdef SUPPORT_UTF8
 219     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
 220 #endif
 221     break;
 222
 223     case OP_TYPEEXACT:
 224     branchlength += GET2(cc,1);
 225     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
 226     break;
 227
 228     /* Handle single-char non-literal matchers */
 229
 230     case OP_PROP:
 231     case OP_NOTPROP:
 232     cc += 2;
 233     /* Fall through */
 234
 235     case OP_NOT_DIGIT:
 236     case OP_DIGIT:
 237     case OP_NOT_WHITESPACE:
 238     case OP_WHITESPACE:
 239     case OP_NOT_WORDCHAR:
 240     case OP_WORDCHAR:
 241     case OP_ANY:
 242     case OP_ALLANY:
 243     case OP_EXTUNI:
 244     case OP_HSPACE:
 245     case OP_NOT_HSPACE:
 246     case OP_VSPACE:
 247     case OP_NOT_VSPACE:
 248     branchlength++;
 249     cc++;
 250     break;
 251
 252     /* "Any newline" might match two characters */
 253
 254     case OP_ANYNL:
 255     branchlength += 2;
 256     cc++;
 257     break;
 258
 259     /* The single-byte matcher means we can't proceed in UTF-8 mode */
 260
 261     case OP_ANYBYTE:
 262 #ifdef SUPPORT_UTF8
 263     if (utf8) return -1;
 264 #endif
 265     branchlength++;
 266     cc++;
 267     break;
 268
 269     /* For repeated character types, we have to test for \p and \P, which have
 270     an extra two bytes of parameters. */
 271
 272     case OP_TYPESTAR:
 273     case OP_TYPEMINSTAR:
 274     case OP_TYPEQUERY:
 275     case OP_TYPEMINQUERY:
 276     case OP_TYPEPOSSTAR:
 277     case OP_TYPEPOSQUERY:
 278     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
 279     cc += _pcre_OP_lengths[op];
 280     break;
 281
 282     case OP_TYPEUPTO:
 283     case OP_TYPEMINUPTO:
 284     case OP_TYPEPOSUPTO:
 285     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
 286     cc += _pcre_OP_lengths[op];
 287     break;
 288
 289     /* Check a class for variable quantification */
 290
 291 #ifdef SUPPORT_UTF8
 292     case OP_XCLASS:
 293     cc += GET(cc, 1) - 33;
 294     /* Fall through */
 295 #endif
 296
 297     case OP_CLASS:
 298     case OP_NCLASS:
 299     cc += 33;
 300
 301     switch (*cc)
 302       {
 303       case OP_CRPLUS:
 304       case OP_CRMINPLUS:
 305       branchlength++;
 306       /* Fall through */
 307
 308       case OP_CRSTAR:
 309       case OP_CRMINSTAR:
 310       case OP_CRQUERY:
 311       case OP_CRMINQUERY:
 312       cc++;
 313       break;
 314
 315       case OP_CRRANGE:
 316       case OP_CRMINRANGE:
 317       branchlength += GET2(cc,1);
 318       cc += 5;
 319       break;
 320
 321       default:
 322       branchlength++;
 323       break;
 324       }
 325     break;
 326
 327     /* Backreferences and subroutine calls are treated in the same way: we find
 328     the minimum length for the subpattern. A recursion, however, causes an
 329     a flag to be set that causes the length of this branch to be ignored. The
 330     logic is that a recursion can only make sense if there is another
 331     alternation that stops the recursing. That will provide the minimum length
 332     (when no recursion happens). A backreference within the group that it is
 333     referencing behaves in the same way.
 334
 335     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
 336     matches an empty string (by default it causes a matching failure), so in
 337     that case we must set the minimum length to zero. */
 338
 339     case OP_REF:
 340     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
 341       {
 342       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
 343       if (cs == NULL) return -2;
 344       do ce += GET(ce, 1); while (*ce == OP_ALT);
 345       if (cc > cs && cc < ce)
 346         {
 347         d = 0;
 348         had_recurse = TRUE;
 349         }
 350       else d = find_minlength(cs, startcode, options);
 351       }
 352     else d = 0;
 353     cc += 3;
 354
 355     /* Handle repeated back references */
 356
 357     switch (*cc)
 358       {
 359       case OP_CRSTAR:
 360       case OP_CRMINSTAR:
 361       case OP_CRQUERY:
 362       case OP_CRMINQUERY:
 363       min = 0;
 364       cc++;
 365       break;
 366
 367       case OP_CRRANGE:
 368       case OP_CRMINRANGE:
 369       min = GET2(cc, 1);
 370       cc += 5;
 371       break;
 372
 373       default:
 374       min = 1;
 375       break;
 376       }
 377
 378     branchlength += min * d;
 379     break;
 380
 381     case OP_RECURSE:
 382     cs = ce = (uschar *)startcode + GET(cc, 1);
 383     if (cs == NULL) return -2;
 384     do ce += GET(ce, 1); while (*ce == OP_ALT);
 385     if (cc > cs && cc < ce)
 386       had_recurse = TRUE;
 387     else
 388       branchlength += find_minlength(cs, startcode, options);
 389     cc += 1 + LINK_SIZE;
 390     break;
 391
 392     /* Anything else does not or need not match a character. We can get the
 393     item's length from the table, but for those that can match zero occurrences
 394     of a character, we must take special action for UTF-8 characters. */
 395
 396     case OP_UPTO:
 397     case OP_NOTUPTO:
 398     case OP_MINUPTO:
 399     case OP_NOTMINUPTO:
 400     case OP_POSUPTO:
 401     case OP_STAR:
 402     case OP_MINSTAR:
 403     case OP_NOTMINSTAR:
 404     case OP_POSSTAR:
 405     case OP_NOTPOSSTAR:
 406     case OP_QUERY:
 407     case OP_MINQUERY:
 408     case OP_NOTMINQUERY:
 409     case OP_POSQUERY:
 410     case OP_NOTPOSQUERY:
 411     cc += _pcre_OP_lengths[op];
 412 #ifdef SUPPORT_UTF8
 413     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
 414 #endif
 415     break;
 416
 417     /* Skip these, but we need to add in the name length. */
 418
 419     case OP_MARK:
 420     case OP_PRUNE_ARG:
 421     case OP_SKIP_ARG:
 422     cc += _pcre_OP_lengths[op] + cc[1];
 423     break;
 424
 425     case OP_THEN_ARG:
 426     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
 427     break;
 428
 429     /* For the record, these are the opcodes that are matched by "default":
 430     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
 431     OP_THEN. */
 432
 433     default:
 434     cc += _pcre_OP_lengths[op];
 435     break;
 436     }
 437   }
 438 /* Control never gets here */
 439 }
 440
 441
 442
 443 /*************************************************
 444 *      Set a bit and maybe its alternate case    *
 445 *************************************************/
 446
 447 /* Given a character, set its first byte's bit in the table, and also the
 448 corresponding bit for the other version of a letter if we are caseless. In
 449 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
 450 when Unicode property support is available.
 451
 452 Arguments:
 453   start_bits    points to the bit map
 454   p             points to the character
 455   caseless      the caseless flag
 456   cd            the block with char table pointers
 457   utf8          TRUE for UTF-8 mode
 458
 459 Returns:        pointer after the character
 460 */
 461
 462 static const uschar *
 463 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
 464   compile_data *cd, BOOL utf8)
 465 {
 466 unsigned int c = *p;
 467
 468 SET_BIT(c);
 469
 470 #ifdef SUPPORT_UTF8
 471 if (utf8 && c > 127)
 472   {
 473   GETCHARINC(c, p);
 474 #ifdef SUPPORT_UCP
 475   if (caseless)
 476     {
 477     uschar buff[8];
 478     c = UCD_OTHERCASE(c);
 479     (void)_pcre_ord2utf8(c, buff);
 480     SET_BIT(buff[0]);
 481     }
 482 #endif
 483   return p;
 484   }
 485 #endif
 486
 487 /* Not UTF-8 mode, or character is less than 127. */
 488
 489 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 490 return p + 1;
 491 }
 492
 493
 494
 495 /*************************************************
 496 *     Set bits for a positive character type     *
 497 *************************************************/
 498
 499 /* This function sets starting bits for a character type. In UTF-8 mode, we can
 500 only do a direct setting for bytes less than 128, as otherwise there can be
 501 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
 502 environment, the tables will only recognize ASCII characters anyway, but in at
 503 least one Windows environment, some higher bytes bits were set in the tables.
 504 So we deal with that case by considering the UTF-8 encoding.
 505
 506 Arguments:
 507   start_bits     the starting bitmap
 508   cbit type      the type of character wanted
 509   table_limit    32 for non-UTF-8; 16 for UTF-8
 510   cd             the block with char table pointers
 511
 512 Returns:         nothing
 513 */
 514
 515 static void
 516 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
 517   compile_data *cd)
 518 {
 519 register int c;
 520 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
 521 if (table_limit == 32) return;
 522 for (c = 128; c < 256; c++)
 523   {
 524   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
 525     {
 526     uschar buff[8];
 527     (void)_pcre_ord2utf8(c, buff);
 528     SET_BIT(buff[0]);
 529     }
 530   }
 531 }
 532
 533
 534 /*************************************************
 535 *     Set bits for a negative character type     *
 536 *************************************************/
 537
 538 /* This function sets starting bits for a negative character type such as \D.
 539 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
 540 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
 541 Unlike in the positive case, where we can set appropriate starting bits for
 542 specific high-valued UTF-8 characters, in this case we have to set the bits for
 543 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
 544 0xc0 (192) for simplicity.
 545
 546 Arguments:
 547   start_bits     the starting bitmap
 548   cbit type      the type of character wanted
 549   table_limit    32 for non-UTF-8; 16 for UTF-8
 550   cd             the block with char table pointers
 551
 552 Returns:         nothing
 553 */
 554
 555 static void
 556 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
 557   compile_data *cd)
 558 {
 559 register int c;
 560 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
 561 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
 562 }
 563
 564
 565
 566 /*************************************************
 567 *          Create bitmap of starting bytes       *
 568 *************************************************/
 569
 570 /* This function scans a compiled unanchored expression recursively and
 571 attempts to build a bitmap of the set of possible starting bytes. As time goes
 572 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
 573 useful for parenthesized groups in patterns such as (a*)b where the group
 574 provides some optional starting bytes but scanning must continue at the outer
 575 level to find at least one mandatory byte. At the outermost level, this
 576 function fails unless the result is SSB_DONE.
 577
 578 Arguments:
 579   code         points to an expression
 580   start_bits   points to a 32-byte table, initialized to 0
 581   caseless     the current state of the caseless flag
 582   utf8         TRUE if in UTF-8 mode
 583   cd           the block with char table pointers
 584
 585 Returns:       SSB_FAIL     => Failed to find any starting bytes
 586                SSB_DONE     => Found mandatory starting bytes
 587                SSB_CONTINUE => Found optional starting bytes
 588 */
 589
 590 static int
 591 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
 592   BOOL utf8, compile_data *cd)
 593 {
 594 register int c;
 595 int yield = SSB_DONE;
 596 int table_limit = utf8? 16:32;
 597
 598 #if 0
 599 /* ========================================================================= */
 600 /* The following comment and code was inserted in January 1999. In May 2006,
 601 when it was observed to cause compiler warnings about unused values, I took it
 602 out again. If anybody is still using OS/2, they will have to put it back
 603 manually. */
 604
 605 /* This next statement and the later reference to dummy are here in order to
 606 trick the optimizer of the IBM C compiler for OS/2 into generating correct
 607 code. Apparently IBM isn't going to fix the problem, and we would rather not
 608 disable optimization (in this module it actually makes a big difference, and
 609 the pcre module can use all the optimization it can get). */
 610
 611 volatile int dummy;
 612 /* ========================================================================= */
 613 #endif
 614
 615 do
 616   {
 617   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
 618   BOOL try_next = TRUE;
 619
 620   while (try_next)    /* Loop for items in this branch */
 621     {
 622     int rc;
 623     switch(*tcode)
 624       {
 625       /* Fail if we reach something we don't understand */
 626
 627       default:
 628       return SSB_FAIL;
 629
 630       /* If we hit a bracket or a positive lookahead assertion, recurse to set
 631       bits from within the subpattern. If it can't find anything, we have to
 632       give up. If it finds some mandatory character(s), we are done for this
 633       branch. Otherwise, carry on scanning after the subpattern. */
 634
 635       case OP_BRA:
 636       case OP_SBRA:
 637       case OP_CBRA:
 638       case OP_SCBRA:
 639       case OP_ONCE:
 640       case OP_ASSERT:
 641       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
 642       if (rc == SSB_FAIL) return SSB_FAIL;
 643       if (rc == SSB_DONE) try_next = FALSE; else
 644         {
 645         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 646         tcode += 1 + LINK_SIZE;
 647         }
 648       break;
 649
 650       /* If we hit ALT or KET, it means we haven't found anything mandatory in
 651       this branch, though we might have found something optional. For ALT, we
 652       continue with the next alternative, but we have to arrange that the final
 653       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
 654       return SSB_CONTINUE: if this is the top level, that indicates failure,
 655       but after a nested subpattern, it causes scanning to continue. */
 656
 657       case OP_ALT:
 658       yield = SSB_CONTINUE;
 659       try_next = FALSE;
 660       break;
 661
 662       case OP_KET:
 663       case OP_KETRMAX:
 664       case OP_KETRMIN:
 665       return SSB_CONTINUE;
 666
 667       /* Skip over callout */
 668
 669       case OP_CALLOUT:
 670       tcode += 2 + 2*LINK_SIZE;
 671       break;
 672
 673       /* Skip over lookbehind and negative lookahead assertions */
 674
 675       case OP_ASSERT_NOT:
 676       case OP_ASSERTBACK:
 677       case OP_ASSERTBACK_NOT:
 678       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 679       tcode += 1 + LINK_SIZE;
 680       break;
 681
 682       /* Skip over an option setting, changing the caseless flag */
 683
 684       case OP_OPT:
 685       caseless = (tcode[1] & PCRE_CASELESS) != 0;
 686       tcode += 2;
 687       break;
 688
 689       /* BRAZERO does the bracket, but carries on. */
 690
 691       case OP_BRAZERO:
 692       case OP_BRAMINZERO:
 693       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
 694         return SSB_FAIL;
 695 /* =========================================================================
 696       See the comment at the head of this function concerning the next line,
 697       which was an old fudge for the benefit of OS/2.
 698       dummy = 1;
 699   ========================================================================= */
 700       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 701       tcode += 1 + LINK_SIZE;
 702       break;
 703
 704       /* SKIPZERO skips the bracket. */
 705
 706       case OP_SKIPZERO:
 707       tcode++;
 708       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 709       tcode += 1 + LINK_SIZE;
 710       break;
 711
 712       /* Single-char * or ? sets the bit and tries the next item */
 713
 714       case OP_STAR:
 715       case OP_MINSTAR:
 716       case OP_POSSTAR:
 717       case OP_QUERY:
 718       case OP_MINQUERY:
 719       case OP_POSQUERY:
 720       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
 721       break;
 722
 723       /* Single-char upto sets the bit and tries the next */
 724
 725       case OP_UPTO:
 726       case OP_MINUPTO:
 727       case OP_POSUPTO:
 728       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
 729       break;
 730
 731       /* At least one single char sets the bit and stops */
 732
 733       case OP_EXACT:       /* Fall through */
 734       tcode += 2;
 735
 736       case OP_CHAR:
 737       case OP_CHARNC:
 738       case OP_PLUS:
 739       case OP_MINPLUS:
 740       case OP_POSPLUS:
 741       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
 742       try_next = FALSE;
 743       break;
 744
 745       /* Special spacing and line-terminating items. These recognize specific
 746       lists of characters. The difference between VSPACE and ANYNL is that the
 747       latter can match the two-character CRLF sequence, but that is not
 748       relevant for finding the first character, so their code here is
 749       identical. */
 750
 751       case OP_HSPACE:
 752       SET_BIT(0x09);
 753       SET_BIT(0x20);
 754       if (utf8)
 755         {
 756         SET_BIT(0xC2);  /* For U+00A0 */
 757         SET_BIT(0xE1);  /* For U+1680, U+180E */
 758         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 759         SET_BIT(0xE3);  /* For U+3000 */
 760         }
 761       else SET_BIT(0xA0);
 762       try_next = FALSE;
 763       break;
 764
 765       case OP_ANYNL:
 766       case OP_VSPACE:
 767       SET_BIT(0x0A);
 768       SET_BIT(0x0B);
 769       SET_BIT(0x0C);
 770       SET_BIT(0x0D);
 771       if (utf8)
 772         {
 773         SET_BIT(0xC2);  /* For U+0085 */
 774         SET_BIT(0xE2);  /* For U+2028, U+2029 */
 775         }
 776       else SET_BIT(0x85);
 777       try_next = FALSE;
 778       break;
 779
 780       /* Single character types set the bits and stop. Note that if PCRE_UCP
 781       is set, we do not see these op codes because \d etc are converted to
 782       properties. Therefore, these apply in the case when only characters less
 783       than 256 are recognized to match the types. */
 784
 785       case OP_NOT_DIGIT:
 786       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 787       try_next = FALSE;
 788       break;
 789
 790       case OP_DIGIT:
 791       set_type_bits(start_bits, cbit_digit, table_limit, cd);
 792       try_next = FALSE;
 793       break;
 794
 795       /* The cbit_space table has vertical tab as whitespace; we have to
 796       ensure it is set as not whitespace. */
 797
 798       case OP_NOT_WHITESPACE:
 799       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 800       start_bits[1] |= 0x08;
 801       try_next = FALSE;
 802       break;
 803
 804       /* The cbit_space table has vertical tab as whitespace; we have to
 805       not set it from the table. */
 806
 807       case OP_WHITESPACE:
 808       c = start_bits[1];    /* Save in case it was already set */
 809       set_type_bits(start_bits, cbit_space, table_limit, cd);
 810       start_bits[1] = (start_bits[1] & ~0x08) | c;
 811       try_next = FALSE;
 812       break;
 813
 814       case OP_NOT_WORDCHAR:
 815       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 816       try_next = FALSE;
 817       break;
 818
 819       case OP_WORDCHAR:
 820       set_type_bits(start_bits, cbit_word, table_limit, cd);
 821       try_next = FALSE;
 822       break;
 823
 824       /* One or more character type fudges the pointer and restarts, knowing
 825       it will hit a single character type and stop there. */
 826
 827       case OP_TYPEPLUS:
 828       case OP_TYPEMINPLUS:
 829       case OP_TYPEPOSPLUS:
 830       tcode++;
 831       break;
 832
 833       case OP_TYPEEXACT:
 834       tcode += 3;
 835       break;
 836
 837       /* Zero or more repeats of character types set the bits and then
 838       try again. */
 839
 840       case OP_TYPEUPTO:
 841       case OP_TYPEMINUPTO:
 842       case OP_TYPEPOSUPTO:
 843       tcode += 2;               /* Fall through */
 844
 845       case OP_TYPESTAR:
 846       case OP_TYPEMINSTAR:
 847       case OP_TYPEPOSSTAR:
 848       case OP_TYPEQUERY:
 849       case OP_TYPEMINQUERY:
 850       case OP_TYPEPOSQUERY:
 851       switch(tcode[1])
 852         {
 853         default:
 854         case OP_ANY:
 855         case OP_ALLANY:
 856         return SSB_FAIL;
 857
 858         case OP_HSPACE:
 859         SET_BIT(0x09);
 860         SET_BIT(0x20);
 861         if (utf8)
 862           {
 863           SET_BIT(0xC2);  /* For U+00A0 */
 864           SET_BIT(0xE1);  /* For U+1680, U+180E */
 865           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 866           SET_BIT(0xE3);  /* For U+3000 */
 867           }
 868         else SET_BIT(0xA0);
 869         break;
 870
 871         case OP_ANYNL:
 872         case OP_VSPACE:
 873         SET_BIT(0x0A);
 874         SET_BIT(0x0B);
 875         SET_BIT(0x0C);
 876         SET_BIT(0x0D);
 877         if (utf8)
 878           {
 879           SET_BIT(0xC2);  /* For U+0085 */
 880           SET_BIT(0xE2);  /* For U+2028, U+2029 */
 881           }
 882         else SET_BIT(0x85);
 883         break;
 884
 885         case OP_NOT_DIGIT:
 886         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 887         break;
 888
 889         case OP_DIGIT:
 890         set_type_bits(start_bits, cbit_digit, table_limit, cd);
 891         break;
 892
 893         /* The cbit_space table has vertical tab as whitespace; we have to
 894         ensure it gets set as not whitespace. */
 895
 896         case OP_NOT_WHITESPACE:
 897         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 898         start_bits[1] |= 0x08;
 899         break;
 900
 901         /* The cbit_space table has vertical tab as whitespace; we have to
 902         avoid setting it. */
 903
 904         case OP_WHITESPACE:
 905         c = start_bits[1];    /* Save in case it was already set */
 906         set_type_bits(start_bits, cbit_space, table_limit, cd);
 907         start_bits[1] = (start_bits[1] & ~0x08) | c;
 908         break;
 909
 910         case OP_NOT_WORDCHAR:
 911         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 912         break;
 913
 914         case OP_WORDCHAR:
 915         set_type_bits(start_bits, cbit_word, table_limit, cd);
 916         break;
 917         }
 918
 919       tcode += 2;
 920       break;
 921
 922       /* Character class where all the information is in a bit map: set the
 923       bits and either carry on or not, according to the repeat count. If it was
 924       a negative class, and we are operating with UTF-8 characters, any byte
 925       with a value >= 0xc4 is a potentially valid starter because it starts a
 926       character with a value > 255. */
 927
 928       case OP_NCLASS:
 929 #ifdef SUPPORT_UTF8
 930       if (utf8)
 931         {
 932         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
 933         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
 934         }
 935 #endif
 936       /* Fall through */
 937
 938       case OP_CLASS:
 939         {
 940         tcode++;
 941
 942         /* In UTF-8 mode, the bits in a bit map correspond to character
 943         values, not to byte values. However, the bit map we are constructing is
 944         for byte values. So we have to do a conversion for characters whose
 945         value is > 127. In fact, there are only two possible starting bytes for
 946         characters in the range 128 - 255. */
 947
 948 #ifdef SUPPORT_UTF8
 949         if (utf8)
 950           {
 951           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
 952           for (c = 128; c < 256; c++)
 953             {
 954             if ((tcode[c/8] && (1 << (c&7))) != 0)
 955               {
 956               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
 957               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
 958               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
 959               }
 960             }
 961           }
 962
 963         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
 964
 965         else
 966 #endif
 967           {
 968           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 969           }
 970
 971         /* Advance past the bit map, and act on what follows */
 972
 973         tcode += 32;
 974         switch (*tcode)
 975           {
 976           case OP_CRSTAR:
 977           case OP_CRMINSTAR:
 978           case OP_CRQUERY:
 979           case OP_CRMINQUERY:
 980           tcode++;
 981           break;
 982
 983           case OP_CRRANGE:
 984           case OP_CRMINRANGE:
 985           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
 986             else try_next = FALSE;
 987           break;
 988
 989           default:
 990           try_next = FALSE;
 991           break;
 992           }
 993         }
 994       break; /* End of bitmap class handling */
 995
 996       }      /* End of switch */
 997     }        /* End of try_next loop */
 998
 999   code += GET(code, 1);   /* Advance to next branch */
1000   }
1001 while (*code == OP_ALT);
1002 return yield;
1003 }
1004
1005
1006
1007 /*************************************************
1008 *          Study a compiled expression           *
1009 *************************************************/
1010
1011 /* This function is handed a compiled expression that it must study to produce
1012 information that will speed up the matching. It returns a pcre_extra block
1013 which then gets handed back to pcre_exec().
1014
1015 Arguments:
1016   re        points to the compiled expression
1017   options   contains option bits
1018   errorptr  points to where to place error messages;
1019             set NULL unless error
1020
1021 Returns:    pointer to a pcre_extra block, with study_data filled in and the
1022               appropriate flags set;
1023             NULL on error or if no optimization possible
1024 */
1025
1026 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1027 pcre_study(const pcre *external_re, int options, const char **errorptr)
1028 {
1029 int min;
1030 BOOL bits_set = FALSE;
1031 uschar start_bits[32];
1032 pcre_extra *extra;
1033 pcre_study_data *study;
1034 const uschar *tables;
1035 uschar *code;
1036 compile_data compile_block;
1037 const real_pcre *re = (const real_pcre *)external_re;
1038
1039 *errorptr = NULL;
1040
1041 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1042   {
1043   *errorptr = "argument is not a compiled regular expression";
1044   return NULL;
1045   }
1046
1047 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1048   {
1049   *errorptr = "unknown or incorrect option bit(s) set";
1050   return NULL;
1051   }
1052
1053 code = (uschar *)re + re->name_table_offset +
1054   (re->name_count * re->name_entry_size);
1055
1056 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1057 a multiline pattern that matches only at "line starts", there is no point in
1058 seeking a list of starting bytes. */
1059
1060 if ((re->options & PCRE_ANCHORED) == 0 &&
1061     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1062   {
1063   /* Set the character tables in the block that is passed around */
1064
1065   tables = re->tables;
1066   if (tables == NULL)
1067     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1068     (void *)(&tables));
1069
1070   compile_block.lcc = tables + lcc_offset;
1071   compile_block.fcc = tables + fcc_offset;
1072   compile_block.cbits = tables + cbits_offset;
1073   compile_block.ctypes = tables + ctypes_offset;
1074
1075   /* See if we can find a fixed set of initial characters for the pattern. */
1076
1077   memset(start_bits, 0, 32 * sizeof(uschar));
1078   bits_set = set_start_bits(code, start_bits,
1079     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1080     &compile_block) == SSB_DONE;
1081   }
1082
1083 /* Find the minimum length of subject string. */
1084
1085 min = find_minlength(code, code, re->options);
1086
1087 /* Return NULL if no optimization is possible. */
1088
1089 if (!bits_set && min < 0) return NULL;
1090
1091 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1092 the latter, which is pointed to by the former, which may also get additional
1093 data set later by the calling program. At the moment, the size of
1094 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1095 the pcre_fullinfo() function so that if it becomes variable in the future, we
1096 don't have to change that code. */
1097
1098 extra = (pcre_extra *)(pcre_malloc)
1099   (sizeof(pcre_extra) + sizeof(pcre_study_data));
1100
1101 if (extra == NULL)
1102   {
1103   *errorptr = "failed to get memory";
1104   return NULL;
1105   }
1106
1107 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1108 extra->flags = PCRE_EXTRA_STUDY_DATA;
1109 extra->study_data = study;
1110
1111 study->size = sizeof(pcre_study_data);
1112 study->flags = 0;
1113
1114 if (bits_set)
1115   {
1116   study->flags |= PCRE_STUDY_MAPPED;
1117   memcpy(study->start_bits, start_bits, sizeof(start_bits));
1118   }
1119
1120 if (min >= 0)
1121   {
1122   study->flags |= PCRE_STUDY_MINLEN;
1123   study->minlength = min;
1124   }
1125
1126 return extra;
1127 }
1128
1129 /* End of pcre_study.c */