glib/pcre/pcre_study.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2012 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_study(), along with local
  42 supporting functions. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #include "pcre_internal.h"
  50
  51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
  52
  53 /* Returns from set_start_bits() */
  54
  55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
  56
  57
  58
  59 /*************************************************
  60 *   Find the minimum subject length for a group  *
  61 *************************************************/
  62
  63 /* Scan a parenthesized group and compute the minimum length of subject that
  64 is needed to match it. This is a lower bound; it does not mean there is a
  65 string of that length that matches. In UTF8 mode, the result is in characters
  66 rather than bytes.
  67
  68 Arguments:
  69   code            pointer to start of group (the bracket)
  70   startcode       pointer to start of the whole pattern
  71   options         the compiling options
  72   int             RECURSE depth
  73
  74 Returns:   the minimum length
  75            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
  76            -2 internal error (missing capturing bracket)
  77            -3 internal error (opcode not listed)
  78 */
  79
  80 static int
  81 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
  82   int recurse_depth)
  83 {
  84 int length = -1;
  85 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
  86 BOOL utf = (options & PCRE_UTF8) != 0;
  87 BOOL had_recurse = FALSE;
  88 register int branchlength = 0;
  89 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
  90
  91 if (*code == OP_CBRA || *code == OP_SCBRA ||
  92     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
  93
  94 /* Scan along the opcodes for this branch. If we get to the end of the
  95 branch, check the length against that of the other branches. */
  96
  97 for (;;)
  98   {
  99   int d, min;
 100   pcre_uchar *cs, *ce;
 101   register int op = *cc;
 102
 103   switch (op)
 104     {
 105     case OP_COND:
 106     case OP_SCOND:
 107
 108     /* If there is only one branch in a condition, the implied branch has zero
 109     length, so we don't add anything. This covers the DEFINE "condition"
 110     automatically. */
 111
 112     cs = cc + GET(cc, 1);
 113     if (*cs != OP_ALT)
 114       {
 115       cc = cs + 1 + LINK_SIZE;
 116       break;
 117       }
 118
 119     /* Otherwise we can fall through and treat it the same as any other
 120     subpattern. */
 121
 122     case OP_CBRA:
 123     case OP_SCBRA:
 124     case OP_BRA:
 125     case OP_SBRA:
 126     case OP_CBRAPOS:
 127     case OP_SCBRAPOS:
 128     case OP_BRAPOS:
 129     case OP_SBRAPOS:
 130     case OP_ONCE:
 131     case OP_ONCE_NC:
 132     d = find_minlength(cc, startcode, options, recurse_depth);
 133     if (d < 0) return d;
 134     branchlength += d;
 135     do cc += GET(cc, 1); while (*cc == OP_ALT);
 136     cc += 1 + LINK_SIZE;
 137     break;
 138
 139     /* ACCEPT makes things far too complicated; we have to give up. */
 140
 141     case OP_ACCEPT:
 142     case OP_ASSERT_ACCEPT:
 143     return -1;
 144
 145     /* Reached end of a branch; if it's a ket it is the end of a nested
 146     call. If it's ALT it is an alternation in a nested call. If it is END it's
 147     the end of the outer call. All can be handled by the same code. If an
 148     ACCEPT was previously encountered, use the length that was in force at that
 149     time, and pass back the shortest ACCEPT length. */
 150
 151     case OP_ALT:
 152     case OP_KET:
 153     case OP_KETRMAX:
 154     case OP_KETRMIN:
 155     case OP_KETRPOS:
 156     case OP_END:
 157     if (length < 0 || (!had_recurse && branchlength < length))
 158       length = branchlength;
 159     if (op != OP_ALT) return length;
 160     cc += 1 + LINK_SIZE;
 161     branchlength = 0;
 162     had_recurse = FALSE;
 163     break;
 164
 165     /* Skip over assertive subpatterns */
 166
 167     case OP_ASSERT:
 168     case OP_ASSERT_NOT:
 169     case OP_ASSERTBACK:
 170     case OP_ASSERTBACK_NOT:
 171     do cc += GET(cc, 1); while (*cc == OP_ALT);
 172     /* Fall through */
 173
 174     /* Skip over things that don't match chars */
 175
 176     case OP_REVERSE:
 177     case OP_CREF:
 178     case OP_NCREF:
 179     case OP_RREF:
 180     case OP_NRREF:
 181     case OP_DEF:
 182     case OP_CALLOUT:
 183     case OP_SOD:
 184     case OP_SOM:
 185     case OP_EOD:
 186     case OP_EODN:
 187     case OP_CIRC:
 188     case OP_CIRCM:
 189     case OP_DOLL:
 190     case OP_DOLLM:
 191     case OP_NOT_WORD_BOUNDARY:
 192     case OP_WORD_BOUNDARY:
 193     cc += PRIV(OP_lengths)[*cc];
 194     break;
 195
 196     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
 197
 198     case OP_BRAZERO:
 199     case OP_BRAMINZERO:
 200     case OP_BRAPOSZERO:
 201     case OP_SKIPZERO:
 202     cc += PRIV(OP_lengths)[*cc];
 203     do cc += GET(cc, 1); while (*cc == OP_ALT);
 204     cc += 1 + LINK_SIZE;
 205     break;
 206
 207     /* Handle literal characters and + repetitions */
 208
 209     case OP_CHAR:
 210     case OP_CHARI:
 211     case OP_NOT:
 212     case OP_NOTI:
 213     case OP_PLUS:
 214     case OP_PLUSI:
 215     case OP_MINPLUS:
 216     case OP_MINPLUSI:
 217     case OP_POSPLUS:
 218     case OP_POSPLUSI:
 219     case OP_NOTPLUS:
 220     case OP_NOTPLUSI:
 221     case OP_NOTMINPLUS:
 222     case OP_NOTMINPLUSI:
 223     case OP_NOTPOSPLUS:
 224     case OP_NOTPOSPLUSI:
 225     branchlength++;
 226     cc += 2;
 227 #ifdef SUPPORT_UTF
 228     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 229 #endif
 230     break;
 231
 232     case OP_TYPEPLUS:
 233     case OP_TYPEMINPLUS:
 234     case OP_TYPEPOSPLUS:
 235     branchlength++;
 236     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
 237     break;
 238
 239     /* Handle exact repetitions. The count is already in characters, but we
 240     need to skip over a multibyte character in UTF8 mode.  */
 241
 242     case OP_EXACT:
 243     case OP_EXACTI:
 244     case OP_NOTEXACT:
 245     case OP_NOTEXACTI:
 246     branchlength += GET2(cc,1);
 247     cc += 2 + IMM2_SIZE;
 248 #ifdef SUPPORT_UTF
 249     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 250 #endif
 251     break;
 252
 253     case OP_TYPEEXACT:
 254     branchlength += GET2(cc,1);
 255     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
 256       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
 257     break;
 258
 259     /* Handle single-char non-literal matchers */
 260
 261     case OP_PROP:
 262     case OP_NOTPROP:
 263     cc += 2;
 264     /* Fall through */
 265
 266     case OP_NOT_DIGIT:
 267     case OP_DIGIT:
 268     case OP_NOT_WHITESPACE:
 269     case OP_WHITESPACE:
 270     case OP_NOT_WORDCHAR:
 271     case OP_WORDCHAR:
 272     case OP_ANY:
 273     case OP_ALLANY:
 274     case OP_EXTUNI:
 275     case OP_HSPACE:
 276     case OP_NOT_HSPACE:
 277     case OP_VSPACE:
 278     case OP_NOT_VSPACE:
 279     branchlength++;
 280     cc++;
 281     break;
 282
 283     /* "Any newline" might match two characters, but it also might match just
 284     one. */
 285
 286     case OP_ANYNL:
 287     branchlength += 1;
 288     cc++;
 289     break;
 290
 291     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
 292     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
 293     appear, but leave the code, just in case.) */
 294
 295     case OP_ANYBYTE:
 296 #ifdef SUPPORT_UTF
 297     if (utf) return -1;
 298 #endif
 299     branchlength++;
 300     cc++;
 301     break;
 302
 303     /* For repeated character types, we have to test for \p and \P, which have
 304     an extra two bytes of parameters. */
 305
 306     case OP_TYPESTAR:
 307     case OP_TYPEMINSTAR:
 308     case OP_TYPEQUERY:
 309     case OP_TYPEMINQUERY:
 310     case OP_TYPEPOSSTAR:
 311     case OP_TYPEPOSQUERY:
 312     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
 313     cc += PRIV(OP_lengths)[op];
 314     break;
 315
 316     case OP_TYPEUPTO:
 317     case OP_TYPEMINUPTO:
 318     case OP_TYPEPOSUPTO:
 319     if (cc[1 + IMM2_SIZE] == OP_PROP
 320       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
 321     cc += PRIV(OP_lengths)[op];
 322     break;
 323
 324     /* Check a class for variable quantification */
 325
 326 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 327     case OP_XCLASS:
 328     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
 329     /* Fall through */
 330 #endif
 331
 332     case OP_CLASS:
 333     case OP_NCLASS:
 334     cc += PRIV(OP_lengths)[OP_CLASS];
 335
 336     switch (*cc)
 337       {
 338       case OP_CRPLUS:
 339       case OP_CRMINPLUS:
 340       branchlength++;
 341       /* Fall through */
 342
 343       case OP_CRSTAR:
 344       case OP_CRMINSTAR:
 345       case OP_CRQUERY:
 346       case OP_CRMINQUERY:
 347       cc++;
 348       break;
 349
 350       case OP_CRRANGE:
 351       case OP_CRMINRANGE:
 352       branchlength += GET2(cc,1);
 353       cc += 1 + 2 * IMM2_SIZE;
 354       break;
 355
 356       default:
 357       branchlength++;
 358       break;
 359       }
 360     break;
 361
 362     /* Backreferences and subroutine calls are treated in the same way: we find
 363     the minimum length for the subpattern. A recursion, however, causes an
 364     a flag to be set that causes the length of this branch to be ignored. The
 365     logic is that a recursion can only make sense if there is another
 366     alternation that stops the recursing. That will provide the minimum length
 367     (when no recursion happens). A backreference within the group that it is
 368     referencing behaves in the same way.
 369
 370     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
 371     matches an empty string (by default it causes a matching failure), so in
 372     that case we must set the minimum length to zero. */
 373
 374     case OP_REF:
 375     case OP_REFI:
 376     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
 377       {
 378       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
 379       if (cs == NULL) return -2;
 380       do ce += GET(ce, 1); while (*ce == OP_ALT);
 381       if (cc > cs && cc < ce)
 382         {
 383         d = 0;
 384         had_recurse = TRUE;
 385         }
 386       else
 387         {
 388         d = find_minlength(cs, startcode, options, recurse_depth);
 389         }
 390       }
 391     else d = 0;
 392     cc += 1 + IMM2_SIZE;
 393
 394     /* Handle repeated back references */
 395
 396     switch (*cc)
 397       {
 398       case OP_CRSTAR:
 399       case OP_CRMINSTAR:
 400       case OP_CRQUERY:
 401       case OP_CRMINQUERY:
 402       min = 0;
 403       cc++;
 404       break;
 405
 406       case OP_CRPLUS:
 407       case OP_CRMINPLUS:
 408       min = 1;
 409       cc++;
 410       break;
 411
 412       case OP_CRRANGE:
 413       case OP_CRMINRANGE:
 414       min = GET2(cc, 1);
 415       cc += 1 + 2 * IMM2_SIZE;
 416       break;
 417
 418       default:
 419       min = 1;
 420       break;
 421       }
 422
 423     branchlength += min * d;
 424     break;
 425
 426     /* We can easily detect direct recursion, but not mutual recursion. This is
 427     caught by a recursion depth count. */
 428
 429     case OP_RECURSE:
 430     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
 431     do ce += GET(ce, 1); while (*ce == OP_ALT);
 432     if ((cc > cs && cc < ce) || recurse_depth > 10)
 433       had_recurse = TRUE;
 434     else
 435       {
 436       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
 437       }
 438     cc += 1 + LINK_SIZE;
 439     break;
 440
 441     /* Anything else does not or need not match a character. We can get the
 442     item's length from the table, but for those that can match zero occurrences
 443     of a character, we must take special action for UTF-8 characters. As it
 444     happens, the "NOT" versions of these opcodes are used at present only for
 445     ASCII characters, so they could be omitted from this list. However, in
 446     future that may change, so we include them here so as not to leave a
 447     gotcha for a future maintainer. */
 448
 449     case OP_UPTO:
 450     case OP_UPTOI:
 451     case OP_NOTUPTO:
 452     case OP_NOTUPTOI:
 453     case OP_MINUPTO:
 454     case OP_MINUPTOI:
 455     case OP_NOTMINUPTO:
 456     case OP_NOTMINUPTOI:
 457     case OP_POSUPTO:
 458     case OP_POSUPTOI:
 459     case OP_NOTPOSUPTO:
 460     case OP_NOTPOSUPTOI:
 461
 462     case OP_STAR:
 463     case OP_STARI:
 464     case OP_NOTSTAR:
 465     case OP_NOTSTARI:
 466     case OP_MINSTAR:
 467     case OP_MINSTARI:
 468     case OP_NOTMINSTAR:
 469     case OP_NOTMINSTARI:
 470     case OP_POSSTAR:
 471     case OP_POSSTARI:
 472     case OP_NOTPOSSTAR:
 473     case OP_NOTPOSSTARI:
 474
 475     case OP_QUERY:
 476     case OP_QUERYI:
 477     case OP_NOTQUERY:
 478     case OP_NOTQUERYI:
 479     case OP_MINQUERY:
 480     case OP_MINQUERYI:
 481     case OP_NOTMINQUERY:
 482     case OP_NOTMINQUERYI:
 483     case OP_POSQUERY:
 484     case OP_POSQUERYI:
 485     case OP_NOTPOSQUERY:
 486     case OP_NOTPOSQUERYI:
 487
 488     cc += PRIV(OP_lengths)[op];
 489 #ifdef SUPPORT_UTF
 490     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 491 #endif
 492     break;
 493
 494     /* Skip these, but we need to add in the name length. */
 495
 496     case OP_MARK:
 497     case OP_PRUNE_ARG:
 498     case OP_SKIP_ARG:
 499     case OP_THEN_ARG:
 500     cc += PRIV(OP_lengths)[op] + cc[1];
 501     break;
 502
 503     /* The remaining opcodes are just skipped over. */
 504
 505     case OP_CLOSE:
 506     case OP_COMMIT:
 507     case OP_FAIL:
 508     case OP_PRUNE:
 509     case OP_SET_SOM:
 510     case OP_SKIP:
 511     case OP_THEN:
 512     cc += PRIV(OP_lengths)[op];
 513     break;
 514
 515     /* This should not occur: we list all opcodes explicitly so that when
 516     new ones get added they are properly considered. */
 517
 518     default:
 519     return -3;
 520     }
 521   }
 522 /* Control never gets here */
 523 }
 524
 525
 526
 527 /*************************************************
 528 *      Set a bit and maybe its alternate case    *
 529 *************************************************/
 530
 531 /* Given a character, set its first byte's bit in the table, and also the
 532 corresponding bit for the other version of a letter if we are caseless. In
 533 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
 534 when Unicode property support is available.
 535
 536 Arguments:
 537   start_bits    points to the bit map
 538   p             points to the character
 539   caseless      the caseless flag
 540   cd            the block with char table pointers
 541   utf           TRUE for UTF-8 / UTF-16 mode
 542
 543 Returns:        pointer after the character
 544 */
 545
 546 static const pcre_uchar *
 547 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
 548   compile_data *cd, BOOL utf)
 549 {
 550 unsigned int c = *p;
 551
 552 #ifdef COMPILE_PCRE8
 553 SET_BIT(c);
 554
 555 #ifdef SUPPORT_UTF
 556 if (utf && c > 127)
 557   {
 558   GETCHARINC(c, p);
 559 #ifdef SUPPORT_UCP
 560   if (caseless)
 561     {
 562     pcre_uchar buff[6];
 563     c = UCD_OTHERCASE(c);
 564     (void)PRIV(ord2utf)(c, buff);
 565     SET_BIT(buff[0]);
 566     }
 567 #endif
 568   return p;
 569   }
 570 #endif
 571
 572 /* Not UTF-8 mode, or character is less than 127. */
 573
 574 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 575 return p + 1;
 576 #endif
 577
 578 #ifdef COMPILE_PCRE16
 579 if (c > 0xff)
 580   {
 581   c = 0xff;
 582   caseless = FALSE;
 583   }
 584 SET_BIT(c);
 585
 586 #ifdef SUPPORT_UTF
 587 if (utf && c > 127)
 588   {
 589   GETCHARINC(c, p);
 590 #ifdef SUPPORT_UCP
 591   if (caseless)
 592     {
 593     c = UCD_OTHERCASE(c);
 594     if (c > 0xff)
 595       c = 0xff;
 596     SET_BIT(c);
 597     }
 598 #endif
 599   return p;
 600   }
 601 #endif
 602
 603 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 604 return p + 1;
 605 #endif
 606 }
 607
 608
 609
 610 /*************************************************
 611 *     Set bits for a positive character type     *
 612 *************************************************/
 613
 614 /* This function sets starting bits for a character type. In UTF-8 mode, we can
 615 only do a direct setting for bytes less than 128, as otherwise there can be
 616 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
 617 environment, the tables will only recognize ASCII characters anyway, but in at
 618 least one Windows environment, some higher bytes bits were set in the tables.
 619 So we deal with that case by considering the UTF-8 encoding.
 620
 621 Arguments:
 622   start_bits     the starting bitmap
 623   cbit type      the type of character wanted
 624   table_limit    32 for non-UTF-8; 16 for UTF-8
 625   cd             the block with char table pointers
 626
 627 Returns:         nothing
 628 */
 629
 630 static void
 631 set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 632   compile_data *cd)
 633 {
 634 register int c;
 635 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
 636 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 637 if (table_limit == 32) return;
 638 for (c = 128; c < 256; c++)
 639   {
 640   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
 641     {
 642     pcre_uchar buff[6];
 643     (void)PRIV(ord2utf)(c, buff);
 644     SET_BIT(buff[0]);
 645     }
 646   }
 647 #endif
 648 }
 649
 650
 651 /*************************************************
 652 *     Set bits for a negative character type     *
 653 *************************************************/
 654
 655 /* This function sets starting bits for a negative character type such as \D.
 656 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
 657 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
 658 Unlike in the positive case, where we can set appropriate starting bits for
 659 specific high-valued UTF-8 characters, in this case we have to set the bits for
 660 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
 661 0xc0 (192) for simplicity.
 662
 663 Arguments:
 664   start_bits     the starting bitmap
 665   cbit type      the type of character wanted
 666   table_limit    32 for non-UTF-8; 16 for UTF-8
 667   cd             the block with char table pointers
 668
 669 Returns:         nothing
 670 */
 671
 672 static void
 673 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
 674   compile_data *cd)
 675 {
 676 register int c;
 677 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
 678 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 679 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
 680 #endif
 681 }
 682
 683
 684
 685 /*************************************************
 686 *          Create bitmap of starting bytes       *
 687 *************************************************/
 688
 689 /* This function scans a compiled unanchored expression recursively and
 690 attempts to build a bitmap of the set of possible starting bytes. As time goes
 691 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
 692 useful for parenthesized groups in patterns such as (a*)b where the group
 693 provides some optional starting bytes but scanning must continue at the outer
 694 level to find at least one mandatory byte. At the outermost level, this
 695 function fails unless the result is SSB_DONE.
 696
 697 Arguments:
 698   code         points to an expression
 699   start_bits   points to a 32-byte table, initialized to 0
 700   utf          TRUE if in UTF-8 / UTF-16 mode
 701   cd           the block with char table pointers
 702
 703 Returns:       SSB_FAIL     => Failed to find any starting bytes
 704                SSB_DONE     => Found mandatory starting bytes
 705                SSB_CONTINUE => Found optional starting bytes
 706                SSB_UNKNOWN  => Hit an unrecognized opcode
 707 */
 708
 709 static int
 710 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
 711   compile_data *cd)
 712 {
 713 register int c;
 714 int yield = SSB_DONE;
 715 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 716 int table_limit = utf? 16:32;
 717 #else
 718 int table_limit = 32;
 719 #endif
 720
 721 #if 0
 722 /* ========================================================================= */
 723 /* The following comment and code was inserted in January 1999. In May 2006,
 724 when it was observed to cause compiler warnings about unused values, I took it
 725 out again. If anybody is still using OS/2, they will have to put it back
 726 manually. */
 727
 728 /* This next statement and the later reference to dummy are here in order to
 729 trick the optimizer of the IBM C compiler for OS/2 into generating correct
 730 code. Apparently IBM isn't going to fix the problem, and we would rather not
 731 disable optimization (in this module it actually makes a big difference, and
 732 the pcre module can use all the optimization it can get). */
 733
 734 volatile int dummy;
 735 /* ========================================================================= */
 736 #endif
 737
 738 do
 739   {
 740   BOOL try_next = TRUE;
 741   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
 742
 743   if (*code == OP_CBRA || *code == OP_SCBRA ||
 744       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
 745
 746   while (try_next)    /* Loop for items in this branch */
 747     {
 748     int rc;
 749
 750     switch(*tcode)
 751       {
 752       /* If we reach something we don't understand, it means a new opcode has
 753       been created that hasn't been added to this code. Hopefully this problem
 754       will be discovered during testing. */
 755
 756       default:
 757       return SSB_UNKNOWN;
 758
 759       /* Fail for a valid opcode that implies no starting bits. */
 760
 761       case OP_ACCEPT:
 762       case OP_ASSERT_ACCEPT:
 763       case OP_ALLANY:
 764       case OP_ANY:
 765       case OP_ANYBYTE:
 766       case OP_CIRC:
 767       case OP_CIRCM:
 768       case OP_CLOSE:
 769       case OP_COMMIT:
 770       case OP_COND:
 771       case OP_CREF:
 772       case OP_DEF:
 773       case OP_DOLL:
 774       case OP_DOLLM:
 775       case OP_END:
 776       case OP_EOD:
 777       case OP_EODN:
 778       case OP_EXTUNI:
 779       case OP_FAIL:
 780       case OP_MARK:
 781       case OP_NCREF:
 782       case OP_NOT:
 783       case OP_NOTEXACT:
 784       case OP_NOTEXACTI:
 785       case OP_NOTI:
 786       case OP_NOTMINPLUS:
 787       case OP_NOTMINPLUSI:
 788       case OP_NOTMINQUERY:
 789       case OP_NOTMINQUERYI:
 790       case OP_NOTMINSTAR:
 791       case OP_NOTMINSTARI:
 792       case OP_NOTMINUPTO:
 793       case OP_NOTMINUPTOI:
 794       case OP_NOTPLUS:
 795       case OP_NOTPLUSI:
 796       case OP_NOTPOSPLUS:
 797       case OP_NOTPOSPLUSI:
 798       case OP_NOTPOSQUERY:
 799       case OP_NOTPOSQUERYI:
 800       case OP_NOTPOSSTAR:
 801       case OP_NOTPOSSTARI:
 802       case OP_NOTPOSUPTO:
 803       case OP_NOTPOSUPTOI:
 804       case OP_NOTPROP:
 805       case OP_NOTQUERY:
 806       case OP_NOTQUERYI:
 807       case OP_NOTSTAR:
 808       case OP_NOTSTARI:
 809       case OP_NOTUPTO:
 810       case OP_NOTUPTOI:
 811       case OP_NOT_HSPACE:
 812       case OP_NOT_VSPACE:
 813       case OP_NRREF:
 814       case OP_PROP:
 815       case OP_PRUNE:
 816       case OP_PRUNE_ARG:
 817       case OP_RECURSE:
 818       case OP_REF:
 819       case OP_REFI:
 820       case OP_REVERSE:
 821       case OP_RREF:
 822       case OP_SCOND:
 823       case OP_SET_SOM:
 824       case OP_SKIP:
 825       case OP_SKIP_ARG:
 826       case OP_SOD:
 827       case OP_SOM:
 828       case OP_THEN:
 829       case OP_THEN_ARG:
 830 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 831       case OP_XCLASS:
 832 #endif
 833       return SSB_FAIL;
 834
 835       /* We can ignore word boundary tests. */
 836
 837       case OP_WORD_BOUNDARY:
 838       case OP_NOT_WORD_BOUNDARY:
 839       tcode++;
 840       break;
 841
 842       /* If we hit a bracket or a positive lookahead assertion, recurse to set
 843       bits from within the subpattern. If it can't find anything, we have to
 844       give up. If it finds some mandatory character(s), we are done for this
 845       branch. Otherwise, carry on scanning after the subpattern. */
 846
 847       case OP_BRA:
 848       case OP_SBRA:
 849       case OP_CBRA:
 850       case OP_SCBRA:
 851       case OP_BRAPOS:
 852       case OP_SBRAPOS:
 853       case OP_CBRAPOS:
 854       case OP_SCBRAPOS:
 855       case OP_ONCE:
 856       case OP_ONCE_NC:
 857       case OP_ASSERT:
 858       rc = set_start_bits(tcode, start_bits, utf, cd);
 859       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 860       if (rc == SSB_DONE) try_next = FALSE; else
 861         {
 862         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 863         tcode += 1 + LINK_SIZE;
 864         }
 865       break;
 866
 867       /* If we hit ALT or KET, it means we haven't found anything mandatory in
 868       this branch, though we might have found something optional. For ALT, we
 869       continue with the next alternative, but we have to arrange that the final
 870       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
 871       return SSB_CONTINUE: if this is the top level, that indicates failure,
 872       but after a nested subpattern, it causes scanning to continue. */
 873
 874       case OP_ALT:
 875       yield = SSB_CONTINUE;
 876       try_next = FALSE;
 877       break;
 878
 879       case OP_KET:
 880       case OP_KETRMAX:
 881       case OP_KETRMIN:
 882       case OP_KETRPOS:
 883       return SSB_CONTINUE;
 884
 885       /* Skip over callout */
 886
 887       case OP_CALLOUT:
 888       tcode += 2 + 2*LINK_SIZE;
 889       break;
 890
 891       /* Skip over lookbehind and negative lookahead assertions */
 892
 893       case OP_ASSERT_NOT:
 894       case OP_ASSERTBACK:
 895       case OP_ASSERTBACK_NOT:
 896       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 897       tcode += 1 + LINK_SIZE;
 898       break;
 899
 900       /* BRAZERO does the bracket, but carries on. */
 901
 902       case OP_BRAZERO:
 903       case OP_BRAMINZERO:
 904       case OP_BRAPOSZERO:
 905       rc = set_start_bits(++tcode, start_bits, utf, cd);
 906       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 907 /* =========================================================================
 908       See the comment at the head of this function concerning the next line,
 909       which was an old fudge for the benefit of OS/2.
 910       dummy = 1;
 911   ========================================================================= */
 912       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 913       tcode += 1 + LINK_SIZE;
 914       break;
 915
 916       /* SKIPZERO skips the bracket. */
 917
 918       case OP_SKIPZERO:
 919       tcode++;
 920       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 921       tcode += 1 + LINK_SIZE;
 922       break;
 923
 924       /* Single-char * or ? sets the bit and tries the next item */
 925
 926       case OP_STAR:
 927       case OP_MINSTAR:
 928       case OP_POSSTAR:
 929       case OP_QUERY:
 930       case OP_MINQUERY:
 931       case OP_POSQUERY:
 932       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 933       break;
 934
 935       case OP_STARI:
 936       case OP_MINSTARI:
 937       case OP_POSSTARI:
 938       case OP_QUERYI:
 939       case OP_MINQUERYI:
 940       case OP_POSQUERYI:
 941       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 942       break;
 943
 944       /* Single-char upto sets the bit and tries the next */
 945
 946       case OP_UPTO:
 947       case OP_MINUPTO:
 948       case OP_POSUPTO:
 949       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
 950       break;
 951
 952       case OP_UPTOI:
 953       case OP_MINUPTOI:
 954       case OP_POSUPTOI:
 955       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
 956       break;
 957
 958       /* At least one single char sets the bit and stops */
 959
 960       case OP_EXACT:
 961       tcode += IMM2_SIZE;
 962       /* Fall through */
 963       case OP_CHAR:
 964       case OP_PLUS:
 965       case OP_MINPLUS:
 966       case OP_POSPLUS:
 967       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 968       try_next = FALSE;
 969       break;
 970
 971       case OP_EXACTI:
 972       tcode += IMM2_SIZE;
 973       /* Fall through */
 974       case OP_CHARI:
 975       case OP_PLUSI:
 976       case OP_MINPLUSI:
 977       case OP_POSPLUSI:
 978       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 979       try_next = FALSE;
 980       break;
 981
 982       /* Special spacing and line-terminating items. These recognize specific
 983       lists of characters. The difference between VSPACE and ANYNL is that the
 984       latter can match the two-character CRLF sequence, but that is not
 985       relevant for finding the first character, so their code here is
 986       identical. */
 987
 988       case OP_HSPACE:
 989       SET_BIT(0x09);
 990       SET_BIT(0x20);
 991 #ifdef SUPPORT_UTF
 992       if (utf)
 993         {
 994 #ifdef COMPILE_PCRE8
 995         SET_BIT(0xC2);  /* For U+00A0 */
 996         SET_BIT(0xE1);  /* For U+1680, U+180E */
 997         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 998         SET_BIT(0xE3);  /* For U+3000 */
 999 #endif
1000 #ifdef COMPILE_PCRE16
1001         SET_BIT(0xA0);
1002         SET_BIT(0xFF);  /* For characters > 255 */
1003 #endif
1004         }
1005       else
1006 #endif /* SUPPORT_UTF */
1007         {
1008         SET_BIT(0xA0);
1009 #ifdef COMPILE_PCRE16
1010         SET_BIT(0xFF);  /* For characters > 255 */
1011 #endif
1012         }
1013       try_next = FALSE;
1014       break;
1015
1016       case OP_ANYNL:
1017       case OP_VSPACE:
1018       SET_BIT(0x0A);
1019       SET_BIT(0x0B);
1020       SET_BIT(0x0C);
1021       SET_BIT(0x0D);
1022 #ifdef SUPPORT_UTF
1023       if (utf)
1024         {
1025 #ifdef COMPILE_PCRE8
1026         SET_BIT(0xC2);  /* For U+0085 */
1027         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1028 #endif
1029 #ifdef COMPILE_PCRE16
1030         SET_BIT(0x85);
1031         SET_BIT(0xFF);  /* For characters > 255 */
1032 #endif
1033         }
1034       else
1035 #endif /* SUPPORT_UTF */
1036         {
1037         SET_BIT(0x85);
1038 #ifdef COMPILE_PCRE16
1039         SET_BIT(0xFF);  /* For characters > 255 */
1040 #endif
1041         }
1042       try_next = FALSE;
1043       break;
1044
1045       /* Single character types set the bits and stop. Note that if PCRE_UCP
1046       is set, we do not see these op codes because \d etc are converted to
1047       properties. Therefore, these apply in the case when only characters less
1048       than 256 are recognized to match the types. */
1049
1050       case OP_NOT_DIGIT:
1051       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1052       try_next = FALSE;
1053       break;
1054
1055       case OP_DIGIT:
1056       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1057       try_next = FALSE;
1058       break;
1059
1060       /* The cbit_space table has vertical tab as whitespace; we have to
1061       ensure it is set as not whitespace. */
1062
1063       case OP_NOT_WHITESPACE:
1064       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1065       start_bits[1] |= 0x08;
1066       try_next = FALSE;
1067       break;
1068
1069       /* The cbit_space table has vertical tab as whitespace; we have to
1070       not set it from the table. */
1071
1072       case OP_WHITESPACE:
1073       c = start_bits[1];    /* Save in case it was already set */
1074       set_type_bits(start_bits, cbit_space, table_limit, cd);
1075       start_bits[1] = (start_bits[1] & ~0x08) | c;
1076       try_next = FALSE;
1077       break;
1078
1079       case OP_NOT_WORDCHAR:
1080       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1081       try_next = FALSE;
1082       break;
1083
1084       case OP_WORDCHAR:
1085       set_type_bits(start_bits, cbit_word, table_limit, cd);
1086       try_next = FALSE;
1087       break;
1088
1089       /* One or more character type fudges the pointer and restarts, knowing
1090       it will hit a single character type and stop there. */
1091
1092       case OP_TYPEPLUS:
1093       case OP_TYPEMINPLUS:
1094       case OP_TYPEPOSPLUS:
1095       tcode++;
1096       break;
1097
1098       case OP_TYPEEXACT:
1099       tcode += 1 + IMM2_SIZE;
1100       break;
1101
1102       /* Zero or more repeats of character types set the bits and then
1103       try again. */
1104
1105       case OP_TYPEUPTO:
1106       case OP_TYPEMINUPTO:
1107       case OP_TYPEPOSUPTO:
1108       tcode += IMM2_SIZE;  /* Fall through */
1109
1110       case OP_TYPESTAR:
1111       case OP_TYPEMINSTAR:
1112       case OP_TYPEPOSSTAR:
1113       case OP_TYPEQUERY:
1114       case OP_TYPEMINQUERY:
1115       case OP_TYPEPOSQUERY:
1116       switch(tcode[1])
1117         {
1118         default:
1119         case OP_ANY:
1120         case OP_ALLANY:
1121         return SSB_FAIL;
1122
1123         case OP_HSPACE:
1124         SET_BIT(0x09);
1125         SET_BIT(0x20);
1126 #ifdef COMPILE_PCRE8
1127         if (utf)
1128           {
1129 #ifdef COMPILE_PCRE8
1130           SET_BIT(0xC2);  /* For U+00A0 */
1131           SET_BIT(0xE1);  /* For U+1680, U+180E */
1132           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1133           SET_BIT(0xE3);  /* For U+3000 */
1134 #endif
1135 #ifdef COMPILE_PCRE16
1136           SET_BIT(0xA0);
1137           SET_BIT(0xFF);  /* For characters > 255 */
1138 #endif
1139           }
1140         else
1141 #endif /* SUPPORT_UTF */
1142           SET_BIT(0xA0);
1143         break;
1144
1145         case OP_ANYNL:
1146         case OP_VSPACE:
1147         SET_BIT(0x0A);
1148         SET_BIT(0x0B);
1149         SET_BIT(0x0C);
1150         SET_BIT(0x0D);
1151 #ifdef COMPILE_PCRE8
1152         if (utf)
1153           {
1154 #ifdef COMPILE_PCRE8
1155           SET_BIT(0xC2);  /* For U+0085 */
1156           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1157 #endif
1158 #ifdef COMPILE_PCRE16
1159           SET_BIT(0x85);
1160           SET_BIT(0xFF);  /* For characters > 255 */
1161 #endif
1162           }
1163         else
1164 #endif /* SUPPORT_UTF */
1165           SET_BIT(0x85);
1166         break;
1167
1168         case OP_NOT_DIGIT:
1169         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1170         break;
1171
1172         case OP_DIGIT:
1173         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1174         break;
1175
1176         /* The cbit_space table has vertical tab as whitespace; we have to
1177         ensure it gets set as not whitespace. */
1178
1179         case OP_NOT_WHITESPACE:
1180         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181         start_bits[1] |= 0x08;
1182         break;
1183
1184         /* The cbit_space table has vertical tab as whitespace; we have to
1185         avoid setting it. */
1186
1187         case OP_WHITESPACE:
1188         c = start_bits[1];    /* Save in case it was already set */
1189         set_type_bits(start_bits, cbit_space, table_limit, cd);
1190         start_bits[1] = (start_bits[1] & ~0x08) | c;
1191         break;
1192
1193         case OP_NOT_WORDCHAR:
1194         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1195         break;
1196
1197         case OP_WORDCHAR:
1198         set_type_bits(start_bits, cbit_word, table_limit, cd);
1199         break;
1200         }
1201
1202       tcode += 2;
1203       break;
1204
1205       /* Character class where all the information is in a bit map: set the
1206       bits and either carry on or not, according to the repeat count. If it was
1207       a negative class, and we are operating with UTF-8 characters, any byte
1208       with a value >= 0xc4 is a potentially valid starter because it starts a
1209       character with a value > 255. */
1210
1211       case OP_NCLASS:
1212 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1213       if (utf)
1214         {
1215         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1216         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1217         }
1218 #endif
1219 #ifdef COMPILE_PCRE16
1220       SET_BIT(0xFF);                         /* For characters > 255 */
1221 #endif
1222       /* Fall through */
1223
1224       case OP_CLASS:
1225         {
1226         pcre_uint8 *map;
1227         tcode++;
1228         map = (pcre_uint8 *)tcode;
1229
1230         /* In UTF-8 mode, the bits in a bit map correspond to character
1231         values, not to byte values. However, the bit map we are constructing is
1232         for byte values. So we have to do a conversion for characters whose
1233         value is > 127. In fact, there are only two possible starting bytes for
1234         characters in the range 128 - 255. */
1235
1236 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1237         if (utf)
1238           {
1239           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1240           for (c = 128; c < 256; c++)
1241             {
1242             if ((map[c/8] && (1 << (c&7))) != 0)
1243               {
1244               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1245               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1246               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1247               }
1248             }
1249           }
1250         else
1251 #endif
1252           {
1253           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1254           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1255           }
1256
1257         /* Advance past the bit map, and act on what follows. For a zero
1258         minimum repeat, continue; otherwise stop processing. */
1259
1260         tcode += 32 / sizeof(pcre_uchar);
1261         switch (*tcode)
1262           {
1263           case OP_CRSTAR:
1264           case OP_CRMINSTAR:
1265           case OP_CRQUERY:
1266           case OP_CRMINQUERY:
1267           tcode++;
1268           break;
1269
1270           case OP_CRRANGE:
1271           case OP_CRMINRANGE:
1272           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1273             else try_next = FALSE;
1274           break;
1275
1276           default:
1277           try_next = FALSE;
1278           break;
1279           }
1280         }
1281       break; /* End of bitmap class handling */
1282
1283       }      /* End of switch */
1284     }        /* End of try_next loop */
1285
1286   code += GET(code, 1);   /* Advance to next branch */
1287   }
1288 while (*code == OP_ALT);
1289 return yield;
1290 }
1291
1292
1293
1294
1295
1296 /*************************************************
1297 *          Study a compiled expression           *
1298 *************************************************/
1299
1300 /* This function is handed a compiled expression that it must study to produce
1301 information that will speed up the matching. It returns a pcre[16]_extra block
1302 which then gets handed back to pcre_exec().
1303
1304 Arguments:
1305   re        points to the compiled expression
1306   options   contains option bits
1307   errorptr  points to where to place error messages;
1308             set NULL unless error
1309
1310 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1311               the appropriate flags set;
1312             NULL on error or if no optimization possible
1313 */
1314
1315 #ifdef COMPILE_PCRE8
1316 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1317 pcre_study(const pcre *external_re, int options, const char **errorptr)
1318 #else
1319 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1320 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1321 #endif
1322 {
1323 int min;
1324 BOOL bits_set = FALSE;
1325 pcre_uint8 start_bits[32];
1326 PUBL(extra) *extra = NULL;
1327 pcre_study_data *study;
1328 const pcre_uint8 *tables;
1329 pcre_uchar *code;
1330 compile_data compile_block;
1331 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1332
1333 *errorptr = NULL;
1334
1335 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1336   {
1337   *errorptr = "argument is not a compiled regular expression";
1338   return NULL;
1339   }
1340
1341 if ((re->flags & PCRE_MODE) == 0)
1342   {
1343 #ifdef COMPILE_PCRE8
1344   *errorptr = "argument is compiled in 16 bit mode";
1345 #else
1346   *errorptr = "argument is compiled in 8 bit mode";
1347 #endif
1348   return NULL;
1349   }
1350
1351 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1352   {
1353   *errorptr = "unknown or incorrect option bit(s) set";
1354   return NULL;
1355   }
1356
1357 code = (pcre_uchar *)re + re->name_table_offset +
1358   (re->name_count * re->name_entry_size);
1359
1360 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1361 a multiline pattern that matches only at "line starts", there is no point in
1362 seeking a list of starting bytes. */
1363
1364 if ((re->options & PCRE_ANCHORED) == 0 &&
1365     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1366   {
1367   int rc;
1368
1369   /* Set the character tables in the block that is passed around */
1370
1371   tables = re->tables;
1372
1373 #ifdef COMPILE_PCRE8
1374   if (tables == NULL)
1375     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1376     (void *)(&tables));
1377 #else
1378   if (tables == NULL)
1379     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1380     (void *)(&tables));
1381 #endif
1382
1383   compile_block.lcc = tables + lcc_offset;
1384   compile_block.fcc = tables + fcc_offset;
1385   compile_block.cbits = tables + cbits_offset;
1386   compile_block.ctypes = tables + ctypes_offset;
1387
1388   /* See if we can find a fixed set of initial characters for the pattern. */
1389
1390   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1391   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1392     &compile_block);
1393   bits_set = rc == SSB_DONE;
1394   if (rc == SSB_UNKNOWN)
1395     {
1396     *errorptr = "internal error: opcode not recognized";
1397     return NULL;
1398     }
1399   }
1400
1401 /* Find the minimum length of subject string. */
1402
1403 switch(min = find_minlength(code, code, re->options, 0))
1404   {
1405   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1406   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1407   default: break;
1408   }
1409
1410 /* If a set of starting bytes has been identified, or if the minimum length is
1411 greater than zero, or if JIT optimization has been requested, get a
1412 pcre[16]_extra block and a pcre_study_data block. The study data is put in the
1413 latter, which is pointed to by the former, which may also get additional data
1414 set later by the calling program. At the moment, the size of pcre_study_data
1415 is fixed. We nevertheless save it in a field for returning via the
1416 pcre_fullinfo() function so that if it becomes variable in the future,
1417 we don't have to change that code. */
1418
1419 if (bits_set || min > 0
1420 #ifdef SUPPORT_JIT
1421     || (options & PCRE_STUDY_JIT_COMPILE) != 0
1422 #endif
1423   )
1424   {
1425   extra = (PUBL(extra) *)(PUBL(malloc))
1426     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1427   if (extra == NULL)
1428     {
1429     *errorptr = "failed to get memory";
1430     return NULL;
1431     }
1432
1433   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1434   extra->flags = PCRE_EXTRA_STUDY_DATA;
1435   extra->study_data = study;
1436
1437   study->size = sizeof(pcre_study_data);
1438   study->flags = 0;
1439
1440   /* Set the start bits always, to avoid unset memory errors if the
1441   study data is written to a file, but set the flag only if any of the bits
1442   are set, to save time looking when none are. */
1443
1444   if (bits_set)
1445     {
1446     study->flags |= PCRE_STUDY_MAPPED;
1447     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1448     }
1449   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1450
1451 #ifdef PCRE_DEBUG
1452   if (bits_set)
1453     {
1454     pcre_uint8 *ptr = start_bits;
1455     int i;
1456
1457     printf("Start bits:\n");
1458     for (i = 0; i < 32; i++)
1459       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1460     }
1461 #endif
1462
1463   /* Always set the minlength value in the block, because the JIT compiler
1464   makes use of it. However, don't set the bit unless the length is greater than
1465   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1466   checking the zero case. */
1467
1468   if (min > 0)
1469     {
1470     study->flags |= PCRE_STUDY_MINLEN;
1471     study->minlength = min;
1472     }
1473   else study->minlength = 0;
1474
1475   /* If JIT support was compiled and requested, attempt the JIT compilation.
1476   If no starting bytes were found, and the minimum length is zero, and JIT
1477   compilation fails, abandon the extra block and return NULL. */
1478
1479 #ifdef SUPPORT_JIT
1480   extra->executable_jit = NULL;
1481   if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
1482   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
1483     {
1484 #ifdef COMPILE_PCRE8
1485     pcre_free_study(extra);
1486 #endif
1487 #ifdef COMPILE_PCRE16
1488     pcre16_free_study(extra);
1489 #endif
1490     extra = NULL;
1491     }
1492 #endif
1493   }
1494
1495 return extra;
1496 }
1497
1498
1499 /*************************************************
1500 *          Free the study data                   *
1501 *************************************************/
1502
1503 /* This function frees the memory that was obtained by pcre_study().
1504
1505 Argument:   a pointer to the pcre[16]_extra block
1506 Returns:    nothing
1507 */
1508
1509 #ifdef COMPILE_PCRE8
1510 PCRE_EXP_DEFN void
1511 pcre_free_study(pcre_extra *extra)
1512 #else
1513 PCRE_EXP_DEFN void
1514 pcre16_free_study(pcre16_extra *extra)
1515 #endif
1516 {
1517 if (extra == NULL)
1518   return;
1519 #ifdef SUPPORT_JIT
1520 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1521      extra->executable_jit != NULL)
1522   PRIV(jit_free)(extra->executable_jit);
1523 #endif
1524 PUBL(free)(extra);
1525 }
1526
1527 /* End of pcre_study.c */