glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2006 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 #define NLBLOCK md             /* Block containing newline information */
  48 #define PSSTART start_subject  /* Field containing processed string start */
  49 #define PSEND   end_subject    /* Field containing processed string end */
  50
  51 #include "pcre_internal.h"
  52
  53
  54 /* For use to indent debugging output */
  55
  56 #define SP "                   "
  57
  58
  59
  60 /*************************************************
  61 *      Code parameters and static tables         *
  62 *************************************************/
  63
  64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  65 into others, under special conditions. A gap of 20 between the blocks should be
  66 enough. */
  67
  68 #define OP_PROP_EXTRA 100     /* Added when the type argument is OP_PROP or OP_NOTPROP */
  69 #define OP_EXTUNI_EXTRA 120   /* Added when the type argument is OP_EXTUNI */
  70 #define OP_ANYNL_EXTRA 140    /* Added when the type argument is OP_ANYNL */
  71
  72
  73 /* This table identifies those opcodes that are followed immediately by a
  74 character that is to be tested in some way. This makes is possible to
  75 centralize the loading of these characters. In the case of Type * etc, the
  76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  77 small value. */
  78
  79 static uschar coptable[] = {
  80   0,                             /* End                                    */
  81   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
  82   0, 0,                          /* Any, Anybyte                           */
  83   0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */
  84   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  85   1,                             /* Char                                   */
  86   1,                             /* Charnc                                 */
  87   1,                             /* not                                    */
  88   /* Positive single-char repeats                                          */
  89   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  90   3, 3, 3,                       /* upto, minupto, exact                   */
  91   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
  92   /* Negative single-char repeats - only for chars < 256                   */
  93   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  94   3, 3, 3,                       /* NOT upto, minupto, exact               */
  95   1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */
  96   /* Positive type repeats                                                 */
  97   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  98   3, 3, 3,                       /* Type upto, minupto, exact              */
  99   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 100   /* Character class & ref repeats                                         */
 101   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 102   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 103   0,                             /* CLASS                                  */
 104   0,                             /* NCLASS                                 */
 105   0,                             /* XCLASS - variable length               */
 106   0,                             /* REF                                    */
 107   0,                             /* RECURSE                                */
 108   0,                             /* CALLOUT                                */
 109   0,                             /* Alt                                    */
 110   0,                             /* Ket                                    */
 111   0,                             /* KetRmax                                */
 112   0,                             /* KetRmin                                */
 113   0,                             /* Assert                                 */
 114   0,                             /* Assert not                             */
 115   0,                             /* Assert behind                          */
 116   0,                             /* Assert behind not                      */
 117   0,                             /* Reverse                                */
 118   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 119   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 120   0,                             /* CREF                                   */
 121   0,                             /* RREF                                   */
 122   0,                             /* DEF                                    */
 123   0, 0                           /* BRAZERO, BRAMINZERO                    */
 124 };
 125
 126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 127 and \w */
 128
 129 static uschar toptable1[] = {
 130   0, 0, 0, 0, 0,
 131   ctype_digit, ctype_digit,
 132   ctype_space, ctype_space,
 133   ctype_word,  ctype_word,
 134   0                               /* OP_ANY */
 135 };
 136
 137 static uschar toptable2[] = {
 138   0, 0, 0, 0, 0,
 139   ctype_digit, 0,
 140   ctype_space, 0,
 141   ctype_word,  0,
 142   1                               /* OP_ANY */
 143 };
 144
 145
 146 /* Structure for holding data about a particular state, which is in effect the
 147 current data for an active path through the match tree. It must consist
 148 entirely of ints because the working vector we are passed, and which we put
 149 these structures in, is a vector of ints. */
 150
 151 typedef struct stateblock {
 152   int offset;                     /* Offset to opcode */
 153   int count;                      /* Count for repeats */
 154   int ims;                        /* ims flag bits */
 155   int data;                       /* Some use extra data */
 156 } stateblock;
 157
 158 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
 159
 160
 161 #ifdef DEBUG
 162 /*************************************************
 163 *             Print character string             *
 164 *************************************************/
 165
 166 /* Character string printing function for debugging.
 167
 168 Arguments:
 169   p            points to string
 170   length       number of bytes
 171   f            where to print
 172
 173 Returns:       nothing
 174 */
 175
 176 static void
 177 pchars(unsigned char *p, int length, FILE *f)
 178 {
 179 int c;
 180 while (length-- > 0)
 181   {
 182   if (isprint(c = *(p++)))
 183     fprintf(f, "%c", c);
 184   else
 185     fprintf(f, "\\x%02x", c);
 186   }
 187 }
 188 #endif
 189
 190
 191
 192 /*************************************************
 193 *    Execute a Regular Expression - DFA engine   *
 194 *************************************************/
 195
 196 /* This internal function applies a compiled pattern to a subject string,
 197 starting at a given point, using a DFA engine. This function is called from the
 198 external one, possibly multiple times if the pattern is not anchored. The
 199 function calls itself recursively for some kinds of subpattern.
 200
 201 Arguments:
 202   md                the match_data block with fixed information
 203   this_start_code   the opening bracket of this subexpression's code
 204   current_subject   where we currently are in the subject string
 205   start_offset      start offset in the subject string
 206   offsets           vector to contain the matching string offsets
 207   offsetcount       size of same
 208   workspace         vector of workspace
 209   wscount           size of same
 210   ims               the current ims flags
 211   rlevel            function call recursion level
 212   recursing         regex recursive call level
 213
 214 Returns:            > 0 => number of matches stored in the offsets vector
 215                     = 0 => matched, but the offsets vector was too small
 216                      -1 => failed to match
 217                    < -1 => some kind of unexpected problem
 218
 219 The following macros are used for adding states to the two state vectors (one
 220 for the current character, one for the following character). */
 221
 222 #define ADD_ACTIVE(x,y) \
 223   if (active_count++ < wscount) \
 224     { \
 225     next_active_state->offset = (x); \
 226     next_active_state->count  = (y); \
 227     next_active_state->ims    = ims; \
 228     next_active_state++; \
 229     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 230     } \
 231   else return PCRE_ERROR_DFA_WSSIZE
 232
 233 #define ADD_ACTIVE_DATA(x,y,z) \
 234   if (active_count++ < wscount) \
 235     { \
 236     next_active_state->offset = (x); \
 237     next_active_state->count  = (y); \
 238     next_active_state->ims    = ims; \
 239     next_active_state->data   = (z); \
 240     next_active_state++; \
 241     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 242     } \
 243   else return PCRE_ERROR_DFA_WSSIZE
 244
 245 #define ADD_NEW(x,y) \
 246   if (new_count++ < wscount) \
 247     { \
 248     next_new_state->offset = (x); \
 249     next_new_state->count  = (y); \
 250     next_new_state->ims    = ims; \
 251     next_new_state++; \
 252     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 253     } \
 254   else return PCRE_ERROR_DFA_WSSIZE
 255
 256 #define ADD_NEW_DATA(x,y,z) \
 257   if (new_count++ < wscount) \
 258     { \
 259     next_new_state->offset = (x); \
 260     next_new_state->count  = (y); \
 261     next_new_state->ims    = ims; \
 262     next_new_state->data   = (z); \
 263     next_new_state++; \
 264     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 265     } \
 266   else return PCRE_ERROR_DFA_WSSIZE
 267
 268 /* And now, here is the code */
 269
 270 static int
 271 internal_dfa_exec(
 272   dfa_match_data *md,
 273   const uschar *this_start_code,
 274   const uschar *current_subject,
 275   int start_offset,
 276   int *offsets,
 277   int offsetcount,
 278   int *workspace,
 279   int wscount,
 280   int ims,
 281   int  rlevel,
 282   int  recursing)
 283 {
 284 stateblock *active_states, *new_states, *temp_states;
 285 stateblock *next_active_state, *next_new_state;
 286
 287 const uschar *ctypes, *lcc, *fcc;
 288 const uschar *ptr;
 289 const uschar *end_code, *first_op;
 290
 291 int active_count, new_count, match_count;
 292
 293 /* Some fields in the md block are frequently referenced, so we load them into
 294 independent variables in the hope that this will perform better. */
 295
 296 const uschar *start_subject = md->start_subject;
 297 const uschar *end_subject = md->end_subject;
 298 const uschar *start_code = md->start_code;
 299
 300 #ifdef SUPPORT_UTF8
 301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 302 #else
 303 BOOL utf8 = FALSE;
 304 #endif
 305
 306 rlevel++;
 307 offsetcount &= (-2);
 308
 309 wscount -= 2;
 310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 311           (2 * INTS_PER_STATEBLOCK);
 312
 313 DPRINTF(("\n%.*s---------------------\n"
 314   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 315   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 316
 317 ctypes = md->tables + ctypes_offset;
 318 lcc = md->tables + lcc_offset;
 319 fcc = md->tables + fcc_offset;
 320
 321 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 322
 323 active_states = (stateblock *)(workspace + 2);
 324 next_new_state = new_states = active_states + wscount;
 325 new_count = 0;
 326
 327 first_op = this_start_code + 1 + LINK_SIZE +
 328   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 329
 330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 331 the alternative states onto the list, and find out where the end is. This
 332 makes is possible to use this function recursively, when we want to stop at a
 333 matching internal ket rather than at the end.
 334
 335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 336 a backward assertion. In that case, we have to find out the maximum amount to
 337 move back, and set up each alternative appropriately. */
 338
 339 if (*first_op == OP_REVERSE)
 340   {
 341   int max_back = 0;
 342   int gone_back;
 343
 344   end_code = this_start_code;
 345   do
 346     {
 347     int back = GET(end_code, 2+LINK_SIZE);
 348     if (back > max_back) max_back = back;
 349     end_code += GET(end_code, 1);
 350     }
 351   while (*end_code == OP_ALT);
 352
 353   /* If we can't go back the amount required for the longest lookbehind
 354   pattern, go back as far as we can; some alternatives may still be viable. */
 355
 356 #ifdef SUPPORT_UTF8
 357   /* In character mode we have to step back character by character */
 358
 359   if (utf8)
 360     {
 361     for (gone_back = 0; gone_back < max_back; gone_back++)
 362       {
 363       if (current_subject <= start_subject) break;
 364       current_subject--;
 365       while (current_subject > start_subject &&
 366              (*current_subject & 0xc0) == 0x80)
 367         current_subject--;
 368       }
 369     }
 370   else
 371 #endif
 372
 373   /* In byte-mode we can do this quickly. */
 374
 375     {
 376     gone_back = (current_subject - max_back < start_subject)?
 377       current_subject - start_subject : max_back;
 378     current_subject -= gone_back;
 379     }
 380
 381   /* Now we can process the individual branches. */
 382
 383   end_code = this_start_code;
 384   do
 385     {
 386     int back = GET(end_code, 2+LINK_SIZE);
 387     if (back <= gone_back)
 388       {
 389       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 390       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 391       }
 392     end_code += GET(end_code, 1);
 393     }
 394   while (*end_code == OP_ALT);
 395  }
 396
 397 /* This is the code for a "normal" subpattern (not a backward assertion). The
 398 start of a whole pattern is always one of these. If we are at the top level,
 399 we may be asked to restart matching from the same point that we reached for a
 400 previous partial match. We still have to scan through the top-level branches to
 401 find the end state. */
 402
 403 else
 404   {
 405   end_code = this_start_code;
 406
 407   /* Restarting */
 408
 409   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 410     {
 411     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 412     new_count = workspace[1];
 413     if (!workspace[0])
 414       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 415     }
 416
 417   /* Not restarting */
 418
 419   else
 420     {
 421     int length = 1 + LINK_SIZE +
 422       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 423     do
 424       {
 425       ADD_NEW(end_code - start_code + length, 0);
 426       end_code += GET(end_code, 1);
 427       length = 1 + LINK_SIZE;
 428       }
 429     while (*end_code == OP_ALT);
 430     }
 431   }
 432
 433 workspace[0] = 0;    /* Bit indicating which vector is current */
 434
 435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 436
 437 /* Loop for scanning the subject */
 438
 439 ptr = current_subject;
 440 for (;;)
 441   {
 442   int i, j;
 443   int clen, dlen;
 444   unsigned int c, d;
 445
 446   /* Make the new state list into the active state list and empty the
 447   new state list. */
 448
 449   temp_states = active_states;
 450   active_states = new_states;
 451   new_states = temp_states;
 452   active_count = new_count;
 453   new_count = 0;
 454
 455   workspace[0] ^= 1;              /* Remember for the restarting feature */
 456   workspace[1] = active_count;
 457
 458 #ifdef DEBUG
 459   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 460   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 461   printf("\"\n");
 462
 463   printf("%.*sActive states: ", rlevel*2-2, SP);
 464   for (i = 0; i < active_count; i++)
 465     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 466   printf("\n");
 467 #endif
 468
 469   /* Set the pointers for adding new states */
 470
 471   next_active_state = active_states + active_count;
 472   next_new_state = new_states;
 473
 474   /* Load the current character from the subject outside the loop, as many
 475   different states may want to look at it, and we assume that at least one
 476   will. */
 477
 478   if (ptr < end_subject)
 479     {
 480     clen = 1;        /* Number of bytes in the character */
 481 #ifdef SUPPORT_UTF8
 482     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 483 #endif  /* SUPPORT_UTF8 */
 484     c = *ptr;
 485     }
 486   else
 487     {
 488     clen = 0;        /* This indicates the end of the subject */
 489     c = NOTACHAR;    /* This value should never actually be used */
 490     }
 491
 492   /* Scan up the active states and act on each one. The result of an action
 493   may be to add more states to the currently active list (e.g. on hitting a
 494   parenthesis) or it may be to put states on the new list, for considering
 495   when we move the character pointer on. */
 496
 497   for (i = 0; i < active_count; i++)
 498     {
 499     stateblock *current_state = active_states + i;
 500     const uschar *code;
 501     int state_offset = current_state->offset;
 502     int count, codevalue;
 503     int chartype, script;
 504
 505 #ifdef DEBUG
 506     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 507     if (clen == 0) printf("EOL\n");
 508       else if (c > 32 && c < 127) printf("'%c'\n", c);
 509         else printf("0x%02x\n", c);
 510 #endif
 511
 512     /* This variable is referred to implicity in the ADD_xxx macros. */
 513
 514     ims = current_state->ims;
 515
 516     /* A negative offset is a special case meaning "hold off going to this
 517     (negated) state until the number of characters in the data field have
 518     been skipped". */
 519
 520     if (state_offset < 0)
 521       {
 522       if (current_state->data > 0)
 523         {
 524         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 525         ADD_NEW_DATA(state_offset, current_state->count,
 526           current_state->data - 1);
 527         continue;
 528         }
 529       else
 530         {
 531         current_state->offset = state_offset = -state_offset;
 532         }
 533       }
 534
 535     /* Check for a duplicate state with the same count, and skip if found. */
 536
 537     for (j = 0; j < i; j++)
 538       {
 539       if (active_states[j].offset == state_offset &&
 540           active_states[j].count == current_state->count)
 541         {
 542         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 543         goto NEXT_ACTIVE_STATE;
 544         }
 545       }
 546
 547     /* The state offset is the offset to the opcode */
 548
 549     code = start_code + state_offset;
 550     codevalue = *code;
 551
 552     /* If this opcode is followed by an inline character, load it. It is
 553     tempting to test for the presence of a subject character here, but that
 554     is wrong, because sometimes zero repetitions of the subject are
 555     permitted.
 556
 557     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 558     argument that is not a data character - but is always one byte long.
 559     Unfortunately, we have to take special action to deal with  \P, \p, and
 560     \X in this case. To keep the other cases fast, convert these ones to new
 561     opcodes. */
 562
 563     if (coptable[codevalue] > 0)
 564       {
 565       dlen = 1;
 566 #ifdef SUPPORT_UTF8
 567       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 568 #endif  /* SUPPORT_UTF8 */
 569       d = code[coptable[codevalue]];
 570       if (codevalue >= OP_TYPESTAR)
 571         {
 572         switch(d)
 573           {
 574           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 575           case OP_NOTPROP:
 576           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 577           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 578           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 579           default: break;
 580           }
 581         }
 582       }
 583     else
 584       {
 585       dlen = 0;         /* Not strictly necessary, but compilers moan */
 586       d = NOTACHAR;     /* if these variables are not set. */
 587       }
 588
 589
 590     /* Now process the individual opcodes */
 591
 592     switch (codevalue)
 593       {
 594
 595 /* ========================================================================== */
 596       /* Reached a closing bracket. If not at the end of the pattern, carry
 597       on with the next opcode. Otherwise, unless we have an empty string and
 598       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
 599       matches so we always have the longest first. */
 600
 601       case OP_KET:
 602       case OP_KETRMIN:
 603       case OP_KETRMAX:
 604       if (code != end_code)
 605         {
 606         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 607         if (codevalue != OP_KET)
 608           {
 609           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 610           }
 611         }
 612       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
 613         {
 614         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 615           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 616             match_count = 0;
 617         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 618         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 619         if (offsetcount >= 2)
 620           {
 621           offsets[0] = current_subject - start_subject;
 622           offsets[1] = ptr - start_subject;
 623           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 624             offsets[1] - offsets[0], current_subject));
 625           }
 626         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 627           {
 628           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 629             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 630             match_count, rlevel*2-2, SP));
 631           return match_count;
 632           }
 633         }
 634       break;
 635
 636 /* ========================================================================== */
 637       /* These opcodes add to the current list of states without looking
 638       at the current character. */
 639
 640       /*-----------------------------------------------------------------*/
 641       case OP_ALT:
 642       do { code += GET(code, 1); } while (*code == OP_ALT);
 643       ADD_ACTIVE(code - start_code, 0);
 644       break;
 645
 646       /*-----------------------------------------------------------------*/
 647       case OP_BRA:
 648       case OP_SBRA:
 649       do
 650         {
 651         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 652         code += GET(code, 1);
 653         }
 654       while (*code == OP_ALT);
 655       break;
 656
 657       /*-----------------------------------------------------------------*/
 658       case OP_CBRA:
 659       case OP_SCBRA:
 660       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 661       code += GET(code, 1);
 662       while (*code == OP_ALT)
 663         {
 664         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 665         code += GET(code, 1);
 666         }
 667       break;
 668
 669       /*-----------------------------------------------------------------*/
 670       case OP_BRAZERO:
 671       case OP_BRAMINZERO:
 672       ADD_ACTIVE(state_offset + 1, 0);
 673       code += 1 + GET(code, 2);
 674       while (*code == OP_ALT) code += GET(code, 1);
 675       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 676       break;
 677
 678       /*-----------------------------------------------------------------*/
 679       case OP_CIRC:
 680       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 681           ((ims & PCRE_MULTILINE) != 0 &&
 682             ptr != end_subject &&
 683             WAS_NEWLINE(ptr)))
 684         { ADD_ACTIVE(state_offset + 1, 0); }
 685       break;
 686
 687       /*-----------------------------------------------------------------*/
 688       case OP_EOD:
 689       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 690       break;
 691
 692       /*-----------------------------------------------------------------*/
 693       case OP_OPT:
 694       ims = code[1];
 695       ADD_ACTIVE(state_offset + 2, 0);
 696       break;
 697
 698       /*-----------------------------------------------------------------*/
 699       case OP_SOD:
 700       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 701       break;
 702
 703       /*-----------------------------------------------------------------*/
 704       case OP_SOM:
 705       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 706       break;
 707
 708
 709 /* ========================================================================== */
 710       /* These opcodes inspect the next subject character, and sometimes
 711       the previous one as well, but do not have an argument. The variable
 712       clen contains the length of the current character and is zero if we are
 713       at the end of the subject. */
 714
 715       /*-----------------------------------------------------------------*/
 716       case OP_ANY:
 717       if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
 718         { ADD_NEW(state_offset + 1, 0); }
 719       break;
 720
 721       /*-----------------------------------------------------------------*/
 722       case OP_EODN:
 723       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 724         { ADD_ACTIVE(state_offset + 1, 0); }
 725       break;
 726
 727       /*-----------------------------------------------------------------*/
 728       case OP_DOLL:
 729       if ((md->moptions & PCRE_NOTEOL) == 0)
 730         {
 731         if (clen == 0 ||
 732             (IS_NEWLINE(ptr) &&
 733                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 734             ))
 735           { ADD_ACTIVE(state_offset + 1, 0); }
 736         }
 737       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 738         { ADD_ACTIVE(state_offset + 1, 0); }
 739       break;
 740
 741       /*-----------------------------------------------------------------*/
 742
 743       case OP_DIGIT:
 744       case OP_WHITESPACE:
 745       case OP_WORDCHAR:
 746       if (clen > 0 && c < 256 &&
 747             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 748         { ADD_NEW(state_offset + 1, 0); }
 749       break;
 750
 751       /*-----------------------------------------------------------------*/
 752       case OP_NOT_DIGIT:
 753       case OP_NOT_WHITESPACE:
 754       case OP_NOT_WORDCHAR:
 755       if (clen > 0 && (c >= 256 ||
 756             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 757         { ADD_NEW(state_offset + 1, 0); }
 758       break;
 759
 760       /*-----------------------------------------------------------------*/
 761       case OP_WORD_BOUNDARY:
 762       case OP_NOT_WORD_BOUNDARY:
 763         {
 764         int left_word, right_word;
 765
 766         if (ptr > start_subject)
 767           {
 768           const uschar *temp = ptr - 1;
 769 #ifdef SUPPORT_UTF8
 770           if (utf8) BACKCHAR(temp);
 771 #endif
 772           GETCHARTEST(d, temp);
 773           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 774           }
 775         else left_word = 0;
 776
 777         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 778           else right_word = 0;
 779
 780         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 781           { ADD_ACTIVE(state_offset + 1, 0); }
 782         }
 783       break;
 784
 785
 786 #ifdef SUPPORT_UCP
 787
 788       /*-----------------------------------------------------------------*/
 789       /* Check the next character by Unicode property. We will get here only
 790       if the support is in the binary; otherwise a compile-time error occurs.
 791       */
 792
 793       case OP_PROP:
 794       case OP_NOTPROP:
 795       if (clen > 0)
 796         {
 797         BOOL OK;
 798         int category = _pcre_ucp_findprop(c, &chartype, &script);
 799         switch(code[1])
 800           {
 801           case PT_ANY:
 802           OK = TRUE;
 803           break;
 804
 805           case PT_LAMP:
 806           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 807           break;
 808
 809           case PT_GC:
 810           OK = category == code[2];
 811           break;
 812
 813           case PT_PC:
 814           OK = chartype == code[2];
 815           break;
 816
 817           case PT_SC:
 818           OK = script == code[2];
 819           break;
 820
 821           /* Should never occur, but keep compilers from grumbling. */
 822
 823           default:
 824           OK = codevalue != OP_PROP;
 825           break;
 826           }
 827
 828         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 829         }
 830       break;
 831 #endif
 832
 833
 834
 835 /* ========================================================================== */
 836       /* These opcodes likewise inspect the subject character, but have an
 837       argument that is not a data character. It is one of these opcodes:
 838       OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
 839       OP_NOT_WORDCHAR. The value is loaded into d. */
 840
 841       case OP_TYPEPLUS:
 842       case OP_TYPEMINPLUS:
 843       case OP_TYPEPOSPLUS:
 844       count = current_state->count;  /* Already matched */
 845       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 846       if (clen > 0)
 847         {
 848         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 849             (c < 256 &&
 850               (d != OP_ANY ||
 851                (ims & PCRE_DOTALL) != 0 ||
 852                !IS_NEWLINE(ptr)
 853               ) &&
 854               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 855           {
 856           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 857             {
 858             active_count--;            /* Remove non-match possibility */
 859             next_active_state--;
 860             }
 861           count++;
 862           ADD_NEW(state_offset, count);
 863           }
 864         }
 865       break;
 866
 867       /*-----------------------------------------------------------------*/
 868       case OP_TYPEQUERY:
 869       case OP_TYPEMINQUERY:
 870       case OP_TYPEPOSQUERY:
 871       ADD_ACTIVE(state_offset + 2, 0);
 872       if (clen > 0)
 873         {
 874         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 875             (c < 256 &&
 876               (d != OP_ANY ||
 877                (ims & PCRE_DOTALL) != 0 ||
 878                !IS_NEWLINE(ptr)
 879               ) &&
 880               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 881           {
 882           if (codevalue == OP_TYPEPOSQUERY)
 883             {
 884             active_count--;            /* Remove non-match possibility */
 885             next_active_state--;
 886             }
 887           ADD_NEW(state_offset + 2, 0);
 888           }
 889         }
 890       break;
 891
 892       /*-----------------------------------------------------------------*/
 893       case OP_TYPESTAR:
 894       case OP_TYPEMINSTAR:
 895       case OP_TYPEPOSSTAR:
 896       ADD_ACTIVE(state_offset + 2, 0);
 897       if (clen > 0)
 898         {
 899         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 900             (c < 256 &&
 901               (d != OP_ANY ||
 902                (ims & PCRE_DOTALL) != 0 ||
 903                !IS_NEWLINE(ptr)
 904               ) &&
 905               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 906           {
 907           if (codevalue == OP_TYPEPOSSTAR)
 908             {
 909             active_count--;            /* Remove non-match possibility */
 910             next_active_state--;
 911             }
 912           ADD_NEW(state_offset, 0);
 913           }
 914         }
 915       break;
 916
 917       /*-----------------------------------------------------------------*/
 918       case OP_TYPEEXACT:
 919       count = current_state->count;  /* Number already matched */
 920       if (clen > 0)
 921         {
 922         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 923             (c < 256 &&
 924               (d != OP_ANY ||
 925                (ims & PCRE_DOTALL) != 0 ||
 926                !IS_NEWLINE(ptr)
 927               ) &&
 928               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 929           {
 930           if (++count >= GET2(code, 1))
 931             { ADD_NEW(state_offset + 4, 0); }
 932           else
 933             { ADD_NEW(state_offset, count); }
 934           }
 935         }
 936       break;
 937
 938       /*-----------------------------------------------------------------*/
 939       case OP_TYPEUPTO:
 940       case OP_TYPEMINUPTO:
 941       case OP_TYPEPOSUPTO:
 942       ADD_ACTIVE(state_offset + 4, 0);
 943       count = current_state->count;  /* Number already matched */
 944       if (clen > 0)
 945         {
 946         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 947             (c < 256 &&
 948               (d != OP_ANY ||
 949                (ims & PCRE_DOTALL) != 0 ||
 950                !IS_NEWLINE(ptr)
 951               ) &&
 952               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 953           {
 954           if (codevalue == OP_TYPEPOSUPTO)
 955             {
 956             active_count--;           /* Remove non-match possibility */
 957             next_active_state--;
 958             }
 959           if (++count >= GET2(code, 1))
 960             { ADD_NEW(state_offset + 4, 0); }
 961           else
 962             { ADD_NEW(state_offset, count); }
 963           }
 964         }
 965       break;
 966
 967 /* ========================================================================== */
 968       /* These are virtual opcodes that are used when something like
 969       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
 970       argument. It keeps the code above fast for the other cases. The argument
 971       is in the d variable. */
 972
 973       case OP_PROP_EXTRA + OP_TYPEPLUS:
 974       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 975       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 976       count = current_state->count;           /* Already matched */
 977       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 978       if (clen > 0)
 979         {
 980         BOOL OK;
 981         int category = _pcre_ucp_findprop(c, &chartype, &script);
 982         switch(code[2])
 983           {
 984           case PT_ANY:
 985           OK = TRUE;
 986           break;
 987
 988           case PT_LAMP:
 989           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 990           break;
 991
 992           case PT_GC:
 993           OK = category == code[3];
 994           break;
 995
 996           case PT_PC:
 997           OK = chartype == code[3];
 998           break;
 999
1000           case PT_SC:
1001           OK = script == code[3];
1002           break;
1003
1004           /* Should never occur, but keep compilers from grumbling. */
1005
1006           default:
1007           OK = codevalue != OP_PROP;
1008           break;
1009           }
1010
1011         if (OK == (d == OP_PROP))
1012           {
1013           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1014             {
1015             active_count--;           /* Remove non-match possibility */
1016             next_active_state--;
1017             }
1018           count++;
1019           ADD_NEW(state_offset, count);
1020           }
1021         }
1022       break;
1023
1024       /*-----------------------------------------------------------------*/
1025       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1026       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1027       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1028       count = current_state->count;  /* Already matched */
1029       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1030       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1031         {
1032         const uschar *nptr = ptr + clen;
1033         int ncount = 0;
1034         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1035           {
1036           active_count--;           /* Remove non-match possibility */
1037           next_active_state--;
1038           }
1039         while (nptr < end_subject)
1040           {
1041           int nd;
1042           int ndlen = 1;
1043           GETCHARLEN(nd, nptr, ndlen);
1044           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1045           ncount++;
1046           nptr += ndlen;
1047           }
1048         count++;
1049         ADD_NEW_DATA(-state_offset, count, ncount);
1050         }
1051       break;
1052
1053       /*-----------------------------------------------------------------*/
1054       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1055       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1056       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1057       count = current_state->count;  /* Already matched */
1058       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1059       if (clen > 0)
1060         {
1061         int ncount = 0;
1062         switch (c)
1063           {
1064           case 0x000d:
1065           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1066           /* Fall through */
1067           case 0x000a:
1068           case 0x000b:
1069           case 0x000c:
1070           case 0x0085:
1071           case 0x2028:
1072           case 0x2029:
1073           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1074             {
1075             active_count--;           /* Remove non-match possibility */
1076             next_active_state--;
1077             }
1078           count++;
1079           ADD_NEW_DATA(-state_offset, count, ncount);
1080           break;
1081           default:
1082           break;
1083           }
1084         }
1085       break;
1086
1087       /*-----------------------------------------------------------------*/
1088       case OP_PROP_EXTRA + OP_TYPEQUERY:
1089       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1090       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1091       count = 4;
1092       goto QS1;
1093
1094       case OP_PROP_EXTRA + OP_TYPESTAR:
1095       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1096       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1097       count = 0;
1098
1099       QS1:
1100
1101       ADD_ACTIVE(state_offset + 4, 0);
1102       if (clen > 0)
1103         {
1104         BOOL OK;
1105         int category = _pcre_ucp_findprop(c, &chartype, &script);
1106         switch(code[2])
1107           {
1108           case PT_ANY:
1109           OK = TRUE;
1110           break;
1111
1112           case PT_LAMP:
1113           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1114           break;
1115
1116           case PT_GC:
1117           OK = category == code[3];
1118           break;
1119
1120           case PT_PC:
1121           OK = chartype == code[3];
1122           break;
1123
1124           case PT_SC:
1125           OK = script == code[3];
1126           break;
1127
1128           /* Should never occur, but keep compilers from grumbling. */
1129
1130           default:
1131           OK = codevalue != OP_PROP;
1132           break;
1133           }
1134
1135         if (OK == (d == OP_PROP))
1136           {
1137           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1138               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1139             {
1140             active_count--;           /* Remove non-match possibility */
1141             next_active_state--;
1142             }
1143           ADD_NEW(state_offset + count, 0);
1144           }
1145         }
1146       break;
1147
1148       /*-----------------------------------------------------------------*/
1149       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1150       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1151       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1152       count = 2;
1153       goto QS2;
1154
1155       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1156       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1157       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1158       count = 0;
1159
1160       QS2:
1161
1162       ADD_ACTIVE(state_offset + 2, 0);
1163       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1164         {
1165         const uschar *nptr = ptr + clen;
1166         int ncount = 0;
1167         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1168             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1169           {
1170           active_count--;           /* Remove non-match possibility */
1171           next_active_state--;
1172           }
1173         while (nptr < end_subject)
1174           {
1175           int nd;
1176           int ndlen = 1;
1177           GETCHARLEN(nd, nptr, ndlen);
1178           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1179           ncount++;
1180           nptr += ndlen;
1181           }
1182         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1183         }
1184       break;
1185
1186       /*-----------------------------------------------------------------*/
1187       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1188       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1189       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1190       count = 2;
1191       goto QS3;
1192
1193       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1194       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1195       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1196       count = 0;
1197
1198       QS3:
1199       ADD_ACTIVE(state_offset + 2, 0);
1200       if (clen > 0)
1201         {
1202         int ncount = 0;
1203         switch (c)
1204           {
1205           case 0x000d:
1206           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1207           /* Fall through */
1208           case 0x000a:
1209           case 0x000b:
1210           case 0x000c:
1211           case 0x0085:
1212           case 0x2028:
1213           case 0x2029:
1214           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1215               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1216             {
1217             active_count--;           /* Remove non-match possibility */
1218             next_active_state--;
1219             }
1220           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1221           break;
1222           default:
1223           break;
1224           }
1225         }
1226       break;
1227
1228       /*-----------------------------------------------------------------*/
1229       case OP_PROP_EXTRA + OP_TYPEEXACT:
1230       case OP_PROP_EXTRA + OP_TYPEUPTO:
1231       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1232       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1233       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1234         { ADD_ACTIVE(state_offset + 6, 0); }
1235       count = current_state->count;  /* Number already matched */
1236       if (clen > 0)
1237         {
1238         BOOL OK;
1239         int category = _pcre_ucp_findprop(c, &chartype, &script);
1240         switch(code[4])
1241           {
1242           case PT_ANY:
1243           OK = TRUE;
1244           break;
1245
1246           case PT_LAMP:
1247           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1248           break;
1249
1250           case PT_GC:
1251           OK = category == code[5];
1252           break;
1253
1254           case PT_PC:
1255           OK = chartype == code[5];
1256           break;
1257
1258           case PT_SC:
1259           OK = script == code[5];
1260           break;
1261
1262           /* Should never occur, but keep compilers from grumbling. */
1263
1264           default:
1265           OK = codevalue != OP_PROP;
1266           break;
1267           }
1268
1269         if (OK == (d == OP_PROP))
1270           {
1271           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1272             {
1273             active_count--;           /* Remove non-match possibility */
1274             next_active_state--;
1275             }
1276           if (++count >= GET2(code, 1))
1277             { ADD_NEW(state_offset + 6, 0); }
1278           else
1279             { ADD_NEW(state_offset, count); }
1280           }
1281         }
1282       break;
1283
1284       /*-----------------------------------------------------------------*/
1285       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1286       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1287       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1288       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1289       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1290         { ADD_ACTIVE(state_offset + 4, 0); }
1291       count = current_state->count;  /* Number already matched */
1292       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1293         {
1294         const uschar *nptr = ptr + clen;
1295         int ncount = 0;
1296         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1297           {
1298           active_count--;           /* Remove non-match possibility */
1299           next_active_state--;
1300           }
1301         while (nptr < end_subject)
1302           {
1303           int nd;
1304           int ndlen = 1;
1305           GETCHARLEN(nd, nptr, ndlen);
1306           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1307           ncount++;
1308           nptr += ndlen;
1309           }
1310         if (++count >= GET2(code, 1))
1311           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1312         else
1313           { ADD_NEW_DATA(-state_offset, count, ncount); }
1314         }
1315       break;
1316
1317       /*-----------------------------------------------------------------*/
1318       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1319       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1320       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1321       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1322       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1323         { ADD_ACTIVE(state_offset + 4, 0); }
1324       count = current_state->count;  /* Number already matched */
1325       if (clen > 0)
1326         {
1327         int ncount = 0;
1328         switch (c)
1329           {
1330           case 0x000d:
1331           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332           /* Fall through */
1333           case 0x000a:
1334           case 0x000b:
1335           case 0x000c:
1336           case 0x0085:
1337           case 0x2028:
1338           case 0x2029:
1339           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1340             {
1341             active_count--;           /* Remove non-match possibility */
1342             next_active_state--;
1343             }
1344           if (++count >= GET2(code, 1))
1345             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1346           else
1347             { ADD_NEW_DATA(-state_offset, count, ncount); }
1348           break;
1349           default:
1350           break;
1351           }
1352         }
1353       break;
1354
1355 /* ========================================================================== */
1356       /* These opcodes are followed by a character that is usually compared
1357       to the current subject character; it is loaded into d. We still get
1358       here even if there is no subject character, because in some cases zero
1359       repetitions are permitted. */
1360
1361       /*-----------------------------------------------------------------*/
1362       case OP_CHAR:
1363       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1364       break;
1365
1366       /*-----------------------------------------------------------------*/
1367       case OP_CHARNC:
1368       if (clen == 0) break;
1369
1370 #ifdef SUPPORT_UTF8
1371       if (utf8)
1372         {
1373         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1374           {
1375           unsigned int othercase;
1376           if (c < 128) othercase = fcc[c]; else
1377
1378           /* If we have Unicode property support, we can use it to test the
1379           other case of the character. */
1380
1381 #ifdef SUPPORT_UCP
1382           othercase = _pcre_ucp_othercase(c);
1383 #else
1384           othercase = NOTACHAR;
1385 #endif
1386
1387           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1388           }
1389         }
1390       else
1391 #endif  /* SUPPORT_UTF8 */
1392
1393       /* Non-UTF-8 mode */
1394         {
1395         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1396         }
1397       break;
1398
1399
1400 #ifdef SUPPORT_UCP
1401       /*-----------------------------------------------------------------*/
1402       /* This is a tricky one because it can match more than one character.
1403       Find out how many characters to skip, and then set up a negative state
1404       to wait for them to pass before continuing. */
1405
1406       case OP_EXTUNI:
1407       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1408         {
1409         const uschar *nptr = ptr + clen;
1410         int ncount = 0;
1411         while (nptr < end_subject)
1412           {
1413           int nclen = 1;
1414           GETCHARLEN(c, nptr, nclen);
1415           if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1416           ncount++;
1417           nptr += nclen;
1418           }
1419         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1420         }
1421       break;
1422 #endif
1423
1424       /*-----------------------------------------------------------------*/
1425       /* This is tricky like EXTUNI because it too can match more than one
1426       character (when CR is followed by LF). In this case, set up a negative
1427       state to wait for one character to pass before continuing. */
1428
1429       case OP_ANYNL:
1430       if (clen > 0) switch(c)
1431         {
1432         case 0x000a:
1433         case 0x000b:
1434         case 0x000c:
1435         case 0x0085:
1436         case 0x2028:
1437         case 0x2029:
1438         ADD_NEW(state_offset + 1, 0);
1439         break;
1440         case 0x000d:
1441         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1442           {
1443           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1444           }
1445         else
1446           {
1447           ADD_NEW(state_offset + 1, 0);
1448           }
1449         break;
1450         }
1451       break;
1452
1453       /*-----------------------------------------------------------------*/
1454       /* Match a negated single character. This is only used for one-byte
1455       characters, that is, we know that d < 256. The character we are
1456       checking (c) can be multibyte. */
1457
1458       case OP_NOT:
1459       if (clen > 0)
1460         {
1461         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1462         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1463         }
1464       break;
1465
1466       /*-----------------------------------------------------------------*/
1467       case OP_PLUS:
1468       case OP_MINPLUS:
1469       case OP_POSPLUS:
1470       case OP_NOTPLUS:
1471       case OP_NOTMINPLUS:
1472       case OP_NOTPOSPLUS:
1473       count = current_state->count;  /* Already matched */
1474       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1475       if (clen > 0)
1476         {
1477         unsigned int otherd = NOTACHAR;
1478         if ((ims & PCRE_CASELESS) != 0)
1479           {
1480 #ifdef SUPPORT_UTF8
1481           if (utf8 && d >= 128)
1482             {
1483 #ifdef SUPPORT_UCP
1484             otherd = _pcre_ucp_othercase(d);
1485 #endif  /* SUPPORT_UCP */
1486             }
1487           else
1488 #endif  /* SUPPORT_UTF8 */
1489           otherd = fcc[d];
1490           }
1491         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1492           {
1493           if (count > 0 &&
1494               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1495             {
1496             active_count--;             /* Remove non-match possibility */
1497             next_active_state--;
1498             }
1499           count++;
1500           ADD_NEW(state_offset, count);
1501           }
1502         }
1503       break;
1504
1505       /*-----------------------------------------------------------------*/
1506       case OP_QUERY:
1507       case OP_MINQUERY:
1508       case OP_POSQUERY:
1509       case OP_NOTQUERY:
1510       case OP_NOTMINQUERY:
1511       case OP_NOTPOSQUERY:
1512       ADD_ACTIVE(state_offset + dlen + 1, 0);
1513       if (clen > 0)
1514         {
1515         unsigned int otherd = NOTACHAR;
1516         if ((ims & PCRE_CASELESS) != 0)
1517           {
1518 #ifdef SUPPORT_UTF8
1519           if (utf8 && d >= 128)
1520             {
1521 #ifdef SUPPORT_UCP
1522             otherd = _pcre_ucp_othercase(d);
1523 #endif  /* SUPPORT_UCP */
1524             }
1525           else
1526 #endif  /* SUPPORT_UTF8 */
1527           otherd = fcc[d];
1528           }
1529         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1530           {
1531           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1532             {
1533             active_count--;            /* Remove non-match possibility */
1534             next_active_state--;
1535             }
1536           ADD_NEW(state_offset + dlen + 1, 0);
1537           }
1538         }
1539       break;
1540
1541       /*-----------------------------------------------------------------*/
1542       case OP_STAR:
1543       case OP_MINSTAR:
1544       case OP_POSSTAR:
1545       case OP_NOTSTAR:
1546       case OP_NOTMINSTAR:
1547       case OP_NOTPOSSTAR:
1548       ADD_ACTIVE(state_offset + dlen + 1, 0);
1549       if (clen > 0)
1550         {
1551         unsigned int otherd = NOTACHAR;
1552         if ((ims & PCRE_CASELESS) != 0)
1553           {
1554 #ifdef SUPPORT_UTF8
1555           if (utf8 && d >= 128)
1556             {
1557 #ifdef SUPPORT_UCP
1558             otherd = _pcre_ucp_othercase(d);
1559 #endif  /* SUPPORT_UCP */
1560             }
1561           else
1562 #endif  /* SUPPORT_UTF8 */
1563           otherd = fcc[d];
1564           }
1565         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1566           {
1567           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1568             {
1569             active_count--;            /* Remove non-match possibility */
1570             next_active_state--;
1571             }
1572           ADD_NEW(state_offset, 0);
1573           }
1574         }
1575       break;
1576
1577       /*-----------------------------------------------------------------*/
1578       case OP_EXACT:
1579       case OP_NOTEXACT:
1580       count = current_state->count;  /* Number already matched */
1581       if (clen > 0)
1582         {
1583         unsigned int otherd = NOTACHAR;
1584         if ((ims & PCRE_CASELESS) != 0)
1585           {
1586 #ifdef SUPPORT_UTF8
1587           if (utf8 && d >= 128)
1588             {
1589 #ifdef SUPPORT_UCP
1590             otherd = _pcre_ucp_othercase(d);
1591 #endif  /* SUPPORT_UCP */
1592             }
1593           else
1594 #endif  /* SUPPORT_UTF8 */
1595           otherd = fcc[d];
1596           }
1597         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1598           {
1599           if (++count >= GET2(code, 1))
1600             { ADD_NEW(state_offset + dlen + 3, 0); }
1601           else
1602             { ADD_NEW(state_offset, count); }
1603           }
1604         }
1605       break;
1606
1607       /*-----------------------------------------------------------------*/
1608       case OP_UPTO:
1609       case OP_MINUPTO:
1610       case OP_POSUPTO:
1611       case OP_NOTUPTO:
1612       case OP_NOTMINUPTO:
1613       case OP_NOTPOSUPTO:
1614       ADD_ACTIVE(state_offset + dlen + 3, 0);
1615       count = current_state->count;  /* Number already matched */
1616       if (clen > 0)
1617         {
1618         unsigned int otherd = NOTACHAR;
1619         if ((ims & PCRE_CASELESS) != 0)
1620           {
1621 #ifdef SUPPORT_UTF8
1622           if (utf8 && d >= 128)
1623             {
1624 #ifdef SUPPORT_UCP
1625             otherd = _pcre_ucp_othercase(d);
1626 #endif  /* SUPPORT_UCP */
1627             }
1628           else
1629 #endif  /* SUPPORT_UTF8 */
1630           otherd = fcc[d];
1631           }
1632         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1633           {
1634           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1635             {
1636             active_count--;             /* Remove non-match possibility */
1637             next_active_state--;
1638             }
1639           if (++count >= GET2(code, 1))
1640             { ADD_NEW(state_offset + dlen + 3, 0); }
1641           else
1642             { ADD_NEW(state_offset, count); }
1643           }
1644         }
1645       break;
1646
1647
1648 /* ========================================================================== */
1649       /* These are the class-handling opcodes */
1650
1651       case OP_CLASS:
1652       case OP_NCLASS:
1653       case OP_XCLASS:
1654         {
1655         BOOL isinclass = FALSE;
1656         int next_state_offset;
1657         const uschar *ecode;
1658
1659         /* For a simple class, there is always just a 32-byte table, and we
1660         can set isinclass from it. */
1661
1662         if (codevalue != OP_XCLASS)
1663           {
1664           ecode = code + 33;
1665           if (clen > 0)
1666             {
1667             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1668               ((code[1 + c/8] & (1 << (c&7))) != 0);
1669             }
1670           }
1671
1672         /* An extended class may have a table or a list of single characters,
1673         ranges, or both, and it may be positive or negative. There's a
1674         function that sorts all this out. */
1675
1676         else
1677          {
1678          ecode = code + GET(code, 1);
1679          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1680          }
1681
1682         /* At this point, isinclass is set for all kinds of class, and ecode
1683         points to the byte after the end of the class. If there is a
1684         quantifier, this is where it will be. */
1685
1686         next_state_offset = ecode - start_code;
1687
1688         switch (*ecode)
1689           {
1690           case OP_CRSTAR:
1691           case OP_CRMINSTAR:
1692           ADD_ACTIVE(next_state_offset + 1, 0);
1693           if (isinclass) { ADD_NEW(state_offset, 0); }
1694           break;
1695
1696           case OP_CRPLUS:
1697           case OP_CRMINPLUS:
1698           count = current_state->count;  /* Already matched */
1699           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1700           if (isinclass) { count++; ADD_NEW(state_offset, count); }
1701           break;
1702
1703           case OP_CRQUERY:
1704           case OP_CRMINQUERY:
1705           ADD_ACTIVE(next_state_offset + 1, 0);
1706           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1707           break;
1708
1709           case OP_CRRANGE:
1710           case OP_CRMINRANGE:
1711           count = current_state->count;  /* Already matched */
1712           if (count >= GET2(ecode, 1))
1713             { ADD_ACTIVE(next_state_offset + 5, 0); }
1714           if (isinclass)
1715             {
1716             int max = GET2(ecode, 3);
1717             if (++count >= max && max != 0)   /* Max 0 => no limit */
1718               { ADD_NEW(next_state_offset + 5, 0); }
1719             else
1720               { ADD_NEW(state_offset, count); }
1721             }
1722           break;
1723
1724           default:
1725           if (isinclass) { ADD_NEW(next_state_offset, 0); }
1726           break;
1727           }
1728         }
1729       break;
1730
1731 /* ========================================================================== */
1732       /* These are the opcodes for fancy brackets of various kinds. We have
1733       to use recursion in order to handle them. */
1734
1735       case OP_ASSERT:
1736       case OP_ASSERT_NOT:
1737       case OP_ASSERTBACK:
1738       case OP_ASSERTBACK_NOT:
1739         {
1740         int rc;
1741         int local_offsets[2];
1742         int local_workspace[1000];
1743         const uschar *endasscode = code + GET(code, 1);
1744
1745         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1746
1747         rc = internal_dfa_exec(
1748           md,                                   /* static match data */
1749           code,                                 /* this subexpression's code */
1750           ptr,                                  /* where we currently are */
1751           ptr - start_subject,                  /* start offset */
1752           local_offsets,                        /* offset vector */
1753           sizeof(local_offsets)/sizeof(int),    /* size of same */
1754           local_workspace,                      /* workspace vector */
1755           sizeof(local_workspace)/sizeof(int),  /* size of same */
1756           ims,                                  /* the current ims flags */
1757           rlevel,                               /* function recursion level */
1758           recursing);                           /* pass on regex recursion */
1759
1760         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1761             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1762         }
1763       break;
1764
1765       /*-----------------------------------------------------------------*/
1766       case OP_COND:
1767       case OP_SCOND:
1768         {
1769         int local_offsets[1000];
1770         int local_workspace[1000];
1771         int condcode = code[LINK_SIZE+1];
1772
1773         /* Back reference conditions are not supported */
1774
1775         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1776
1777         /* The DEFINE condition is always false */
1778
1779         if (condcode == OP_DEF)
1780           {
1781           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1782           }
1783
1784         /* The only supported version of OP_RREF is for the value RREF_ANY,
1785         which means "test if in any recursion". We can't test for specifically
1786         recursed groups. */
1787
1788         else if (condcode == OP_RREF)
1789           {
1790           int value = GET2(code, LINK_SIZE+2);
1791           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1792           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1793             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1794           }
1795
1796         /* Otherwise, the condition is an assertion */
1797
1798         else
1799           {
1800           int rc;
1801           const uschar *asscode = code + LINK_SIZE + 1;
1802           const uschar *endasscode = asscode + GET(asscode, 1);
1803
1804           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1805
1806           rc = internal_dfa_exec(
1807             md,                                   /* fixed match data */
1808             asscode,                              /* this subexpression's code */
1809             ptr,                                  /* where we currently are */
1810             ptr - start_subject,                  /* start offset */
1811             local_offsets,                        /* offset vector */
1812             sizeof(local_offsets)/sizeof(int),    /* size of same */
1813             local_workspace,                      /* workspace vector */
1814             sizeof(local_workspace)/sizeof(int),  /* size of same */
1815             ims,                                  /* the current ims flags */
1816             rlevel,                               /* function recursion level */
1817             recursing);                           /* pass on regex recursion */
1818
1819           if ((rc >= 0) ==
1820                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1821             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1822           else
1823             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1824           }
1825         }
1826       break;
1827
1828       /*-----------------------------------------------------------------*/
1829       case OP_RECURSE:
1830         {
1831         int local_offsets[1000];
1832         int local_workspace[1000];
1833         int rc;
1834
1835         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1836           recursing + 1));
1837
1838         rc = internal_dfa_exec(
1839           md,                                   /* fixed match data */
1840           start_code + GET(code, 1),            /* this subexpression's code */
1841           ptr,                                  /* where we currently are */
1842           ptr - start_subject,                  /* start offset */
1843           local_offsets,                        /* offset vector */
1844           sizeof(local_offsets)/sizeof(int),    /* size of same */
1845           local_workspace,                      /* workspace vector */
1846           sizeof(local_workspace)/sizeof(int),  /* size of same */
1847           ims,                                  /* the current ims flags */
1848           rlevel,                               /* function recursion level */
1849           recursing + 1);                       /* regex recurse level */
1850
1851         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1852           recursing + 1, rc));
1853
1854         /* Ran out of internal offsets */
1855
1856         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1857
1858         /* For each successful matched substring, set up the next state with a
1859         count of characters to skip before trying it. Note that the count is in
1860         characters, not bytes. */
1861
1862         if (rc > 0)
1863           {
1864           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1865             {
1866             const uschar *p = start_subject + local_offsets[rc];
1867             const uschar *pp = start_subject + local_offsets[rc+1];
1868             int charcount = local_offsets[rc+1] - local_offsets[rc];
1869             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1870             if (charcount > 0)
1871               {
1872               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1873               }
1874             else
1875               {
1876               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1877               }
1878             }
1879           }
1880         else if (rc != PCRE_ERROR_NOMATCH) return rc;
1881         }
1882       break;
1883
1884       /*-----------------------------------------------------------------*/
1885       case OP_ONCE:
1886         {
1887         int local_offsets[2];
1888         int local_workspace[1000];
1889
1890         int rc = internal_dfa_exec(
1891           md,                                   /* fixed match data */
1892           code,                                 /* this subexpression's code */
1893           ptr,                                  /* where we currently are */
1894           ptr - start_subject,                  /* start offset */
1895           local_offsets,                        /* offset vector */
1896           sizeof(local_offsets)/sizeof(int),    /* size of same */
1897           local_workspace,                      /* workspace vector */
1898           sizeof(local_workspace)/sizeof(int),  /* size of same */
1899           ims,                                  /* the current ims flags */
1900           rlevel,                               /* function recursion level */
1901           recursing);                           /* pass on regex recursion */
1902
1903         if (rc >= 0)
1904           {
1905           const uschar *end_subpattern = code;
1906           int charcount = local_offsets[1] - local_offsets[0];
1907           int next_state_offset, repeat_state_offset;
1908
1909           do { end_subpattern += GET(end_subpattern, 1); }
1910             while (*end_subpattern == OP_ALT);
1911           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1912
1913           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1914           arrange for the repeat state also to be added to the relevant list.
1915           Calculate the offset, or set -1 for no repeat. */
1916
1917           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1918                                  *end_subpattern == OP_KETRMIN)?
1919             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1920
1921           /* If we have matched an empty string, add the next state at the
1922           current character pointer. This is important so that the duplicate
1923           checking kicks in, which is what breaks infinite loops that match an
1924           empty string. */
1925
1926           if (charcount == 0)
1927             {
1928             ADD_ACTIVE(next_state_offset, 0);
1929             }
1930
1931           /* Optimization: if there are no more active states, and there
1932           are no new states yet set up, then skip over the subject string
1933           right here, to save looping. Otherwise, set up the new state to swing
1934           into action when the end of the substring is reached. */
1935
1936           else if (i + 1 >= active_count && new_count == 0)
1937             {
1938             ptr += charcount;
1939             clen = 0;
1940             ADD_NEW(next_state_offset, 0);
1941
1942             /* If we are adding a repeat state at the new character position,
1943             we must fudge things so that it is the only current state.
1944             Otherwise, it might be a duplicate of one we processed before, and
1945             that would cause it to be skipped. */
1946
1947             if (repeat_state_offset >= 0)
1948               {
1949               next_active_state = active_states;
1950               active_count = 0;
1951               i = -1;
1952               ADD_ACTIVE(repeat_state_offset, 0);
1953               }
1954             }
1955           else
1956             {
1957             const uschar *p = start_subject + local_offsets[0];
1958             const uschar *pp = start_subject + local_offsets[1];
1959             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1960             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1961             if (repeat_state_offset >= 0)
1962               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1963             }
1964
1965           }
1966         else if (rc != PCRE_ERROR_NOMATCH) return rc;
1967         }
1968       break;
1969
1970
1971 /* ========================================================================== */
1972       /* Handle callouts */
1973
1974       case OP_CALLOUT:
1975       if (pcre_callout != NULL)
1976         {
1977         int rrc;
1978         pcre_callout_block cb;
1979         cb.version          = 1;   /* Version 1 of the callout block */
1980         cb.callout_number   = code[1];
1981         cb.offset_vector    = offsets;
1982         cb.subject          = (PCRE_SPTR)start_subject;
1983         cb.subject_length   = end_subject - start_subject;
1984         cb.start_match      = current_subject - start_subject;
1985         cb.current_position = ptr - start_subject;
1986         cb.pattern_position = GET(code, 2);
1987         cb.next_item_length = GET(code, 2 + LINK_SIZE);
1988         cb.capture_top      = 1;
1989         cb.capture_last     = -1;
1990         cb.callout_data     = md->callout_data;
1991         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
1992         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1993         }
1994       break;
1995
1996
1997 /* ========================================================================== */
1998       default:        /* Unsupported opcode */
1999       return PCRE_ERROR_DFA_UITEM;
2000       }
2001
2002     NEXT_ACTIVE_STATE: continue;
2003
2004     }      /* End of loop scanning active states */
2005
2006   /* We have finished the processing at the current subject character. If no
2007   new states have been set for the next character, we have found all the
2008   matches that we are going to find. If we are at the top level and partial
2009   matching has been requested, check for appropriate conditions. */
2010
2011   if (new_count <= 0)
2012     {
2013     if (match_count < 0 &&                     /* No matches found */
2014         rlevel == 1 &&                         /* Top level match function */
2015         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2016         ptr >= end_subject &&                  /* Reached end of subject */
2017         ptr > current_subject)                 /* Matched non-empty string */
2018       {
2019       if (offsetcount >= 2)
2020         {
2021         offsets[0] = current_subject - start_subject;
2022         offsets[1] = end_subject - start_subject;
2023         }
2024       match_count = PCRE_ERROR_PARTIAL;
2025       }
2026
2027     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2028       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2029       rlevel*2-2, SP));
2030     break;        /* In effect, "return", but see the comment below */
2031     }
2032
2033   /* One or more states are active for the next character. */
2034
2035   ptr += clen;    /* Advance to next subject character */
2036   }               /* Loop to move along the subject string */
2037
2038 /* Control gets here from "break" a few lines above. We do it this way because
2039 if we use "return" above, we have compiler trouble. Some compilers warn if
2040 there's nothing here because they think the function doesn't return a value. On
2041 the other hand, if we put a dummy statement here, some more clever compilers
2042 complain that it can't be reached. Sigh. */
2043
2044 return match_count;
2045 }
2046
2047
2048
2049
2050 /*************************************************
2051 *    Execute a Regular Expression - DFA engine   *
2052 *************************************************/
2053
2054 /* This external function applies a compiled re to a subject string using a DFA
2055 engine. This function calls the internal function multiple times if the pattern
2056 is not anchored.
2057
2058 Arguments:
2059   argument_re     points to the compiled expression
2060   extra_data      points to extra data or is NULL (not currently used)
2061   subject         points to the subject string
2062   length          length of subject string (may contain binary zeros)
2063   start_offset    where to start in the subject string
2064   options         option bits
2065   offsets         vector of match offsets
2066   offsetcount     size of same
2067   workspace       workspace vector
2068   wscount         size of same
2069
2070 Returns:          > 0 => number of match offset pairs placed in offsets
2071                   = 0 => offsets overflowed; longest matches are present
2072                    -1 => failed to match
2073                  < -1 => some kind of unexpected problem
2074 */
2075
2076 PCRE_DATA_SCOPE int
2077 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2078   const char *subject, int length, int start_offset, int options, int *offsets,
2079   int offsetcount, int *workspace, int wscount)
2080 {
2081 real_pcre *re = (real_pcre *)argument_re;
2082 dfa_match_data match_block;
2083 dfa_match_data *md = &match_block;
2084 BOOL utf8, anchored, startline, firstline;
2085 const uschar *current_subject, *end_subject, *lcc;
2086
2087 pcre_study_data internal_study;
2088 const pcre_study_data *study = NULL;
2089 real_pcre internal_re;
2090
2091 const uschar *req_byte_ptr;
2092 const uschar *start_bits = NULL;
2093 BOOL first_byte_caseless = FALSE;
2094 BOOL req_byte_caseless = FALSE;
2095 int first_byte = -1;
2096 int req_byte = -1;
2097 int req_byte2 = -1;
2098 int newline;
2099
2100 /* Plausibility checks */
2101
2102 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2103 if (re == NULL || subject == NULL || workspace == NULL ||
2104    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2105 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2106 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2107
2108 /* We need to find the pointer to any study data before we test for byte
2109 flipping, so we scan the extra_data block first. This may set two fields in the
2110 match block, so we must initialize them beforehand. However, the other fields
2111 in the match block must not be set until after the byte flipping. */
2112
2113 md->tables = re->tables;
2114 md->callout_data = NULL;
2115
2116 if (extra_data != NULL)
2117   {
2118   unsigned int flags = extra_data->flags;
2119   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2120     study = (const pcre_study_data *)extra_data->study_data;
2121   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2122   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2123     return PCRE_ERROR_DFA_UMLIMIT;
2124   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2125     md->callout_data = extra_data->callout_data;
2126   if ((flags & PCRE_EXTRA_TABLES) != 0)
2127     md->tables = extra_data->tables;
2128   }
2129
2130 /* Check that the first field in the block is the magic number. If it is not,
2131 test for a regex that was compiled on a host of opposite endianness. If this is
2132 the case, flipped values are put in internal_re and internal_study if there was
2133 study data too. */
2134
2135 if (re->magic_number != MAGIC_NUMBER)
2136   {
2137   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2138   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2139   if (study != NULL) study = &internal_study;
2140   }
2141
2142 /* Set some local values */
2143
2144 current_subject = (const unsigned char *)subject + start_offset;
2145 end_subject = (const unsigned char *)subject + length;
2146 req_byte_ptr = current_subject - 1;
2147
2148 #ifdef SUPPORT_UTF8
2149 utf8 = (re->options & PCRE_UTF8) != 0;
2150 #else
2151 utf8 = FALSE;
2152 #endif
2153
2154 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2155   (re->options & PCRE_ANCHORED) != 0;
2156
2157 /* The remaining fixed data for passing around. */
2158
2159 md->start_code = (const uschar *)argument_re +
2160     re->name_table_offset + re->name_count * re->name_entry_size;
2161 md->start_subject = (const unsigned char *)subject;
2162 md->end_subject = end_subject;
2163 md->moptions = options;
2164 md->poptions = re->options;
2165
2166 /* Handle different types of newline. The two bits give four cases. If nothing
2167 is set at run time, whatever was used at compile time applies. */
2168
2169 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
2170          PCRE_NEWLINE_BITS)
2171   {
2172   case 0: newline = NEWLINE; break;   /* Compile-time default */
2173   case PCRE_NEWLINE_CR: newline = '\r'; break;
2174   case PCRE_NEWLINE_LF: newline = '\n'; break;
2175   case PCRE_NEWLINE_CR+
2176        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2177   case PCRE_NEWLINE_ANY: newline = -1; break;
2178   default: return PCRE_ERROR_BADNEWLINE;
2179   }
2180
2181 if (newline < 0)
2182   {
2183   md->nltype = NLTYPE_ANY;
2184   }
2185 else
2186   {
2187   md->nltype = NLTYPE_FIXED;
2188   if (newline > 255)
2189     {
2190     md->nllen = 2;
2191     md->nl[0] = (newline >> 8) & 255;
2192     md->nl[1] = newline & 255;
2193     }
2194   else
2195     {
2196     md->nllen = 1;
2197     md->nl[0] = newline;
2198     }
2199   }
2200
2201 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2202 back the character offset. */
2203
2204 #ifdef SUPPORT_UTF8
2205 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2206   {
2207   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2208     return PCRE_ERROR_BADUTF8;
2209   if (start_offset > 0 && start_offset < length)
2210     {
2211     int tb = ((uschar *)subject)[start_offset];
2212     if (tb > 127)
2213       {
2214       tb &= 0xc0;
2215       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2216       }
2217     }
2218   }
2219 #endif
2220
2221 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2222 is a feature that makes it possible to save compiled regex and re-use them
2223 in other programs later. */
2224
2225 if (md->tables == NULL) md->tables = _pcre_default_tables;
2226
2227 /* The lower casing table and the "must be at the start of a line" flag are
2228 used in a loop when finding where to start. */
2229
2230 lcc = md->tables + lcc_offset;
2231 startline = (re->options & PCRE_STARTLINE) != 0;
2232 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2233
2234 /* Set up the first character to match, if available. The first_byte value is
2235 never set for an anchored regular expression, but the anchoring may be forced
2236 at run time, so we have to test for anchoring. The first char may be unset for
2237 an unanchored pattern, of course. If there's no first char and the pattern was
2238 studied, there may be a bitmap of possible first characters. */
2239
2240 if (!anchored)
2241   {
2242   if ((re->options & PCRE_FIRSTSET) != 0)
2243     {
2244     first_byte = re->first_byte & 255;
2245     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2246       first_byte = lcc[first_byte];
2247     }
2248   else
2249     {
2250     if (startline && study != NULL &&
2251          (study->options & PCRE_STUDY_MAPPED) != 0)
2252       start_bits = study->start_bits;
2253     }
2254   }
2255
2256 /* For anchored or unanchored matches, there may be a "last known required
2257 character" set. */
2258
2259 if ((re->options & PCRE_REQCHSET) != 0)
2260   {
2261   req_byte = re->req_byte & 255;
2262   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2263   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2264   }
2265
2266 /* Call the main matching function, looping for a non-anchored regex after a
2267 failed match. Unless restarting, optimize by moving to the first match
2268 character if possible, when not anchored. Then unless wanting a partial match,
2269 check for a required later character. */
2270
2271 for (;;)
2272   {
2273   int rc;
2274
2275   if ((options & PCRE_DFA_RESTART) == 0)
2276     {
2277     const uschar *save_end_subject = end_subject;
2278
2279     /* Advance to a unique first char if possible. If firstline is TRUE, the
2280     start of the match is constrained to the first line of a multiline string.
2281     Implement this by temporarily adjusting end_subject so that we stop
2282     scanning at a newline. If the match fails at the newline, later code breaks
2283     this loop. */
2284
2285     if (firstline)
2286       {
2287       const uschar *t = current_subject;
2288       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2289       end_subject = t;
2290       }
2291
2292     if (first_byte >= 0)
2293       {
2294       if (first_byte_caseless)
2295         while (current_subject < end_subject &&
2296                lcc[*current_subject] != first_byte)
2297           current_subject++;
2298       else
2299         while (current_subject < end_subject && *current_subject != first_byte)
2300           current_subject++;
2301       }
2302
2303     /* Or to just after a linebreak for a multiline match if possible */
2304
2305     else if (startline)
2306       {
2307       if (current_subject > md->start_subject + start_offset)
2308         {
2309         while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2310           current_subject++;
2311         }
2312       }
2313
2314     /* Or to a non-unique first char after study */
2315
2316     else if (start_bits != NULL)
2317       {
2318       while (current_subject < end_subject)
2319         {
2320         register unsigned int c = *current_subject;
2321         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2322           else break;
2323         }
2324       }
2325
2326     /* Restore fudged end_subject */
2327
2328     end_subject = save_end_subject;
2329     }
2330
2331   /* If req_byte is set, we know that that character must appear in the subject
2332   for the match to succeed. If the first character is set, req_byte must be
2333   later in the subject; otherwise the test starts at the match point. This
2334   optimization can save a huge amount of work in patterns with nested unlimited
2335   repeats that aren't going to match. Writing separate code for cased/caseless
2336   versions makes it go faster, as does using an autoincrement and backing off
2337   on a match.
2338
2339   HOWEVER: when the subject string is very, very long, searching to its end can
2340   take a long time, and give bad performance on quite ordinary patterns. This
2341   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2342   don't do this when the string is sufficiently long.
2343
2344   ALSO: this processing is disabled when partial matching is requested.
2345   */
2346
2347   if (req_byte >= 0 &&
2348       end_subject - current_subject < REQ_BYTE_MAX &&
2349       (options & PCRE_PARTIAL) == 0)
2350     {
2351     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2352
2353     /* We don't need to repeat the search if we haven't yet reached the
2354     place we found it at last time. */
2355
2356     if (p > req_byte_ptr)
2357       {
2358       if (req_byte_caseless)
2359         {
2360         while (p < end_subject)
2361           {
2362           register int pp = *p++;
2363           if (pp == req_byte || pp == req_byte2) { p--; break; }
2364           }
2365         }
2366       else
2367         {
2368         while (p < end_subject)
2369           {
2370           if (*p++ == req_byte) { p--; break; }
2371           }
2372         }
2373
2374       /* If we can't find the required character, break the matching loop,
2375       which will cause a return or PCRE_ERROR_NOMATCH. */
2376
2377       if (p >= end_subject) break;
2378
2379       /* If we have found the required character, save the point where we
2380       found it, so that we don't search again next time round the loop if
2381       the start hasn't passed this character yet. */
2382
2383       req_byte_ptr = p;
2384       }
2385     }
2386
2387   /* OK, now we can do the business */
2388
2389   rc = internal_dfa_exec(
2390     md,                                /* fixed match data */
2391     md->start_code,                    /* this subexpression's code */
2392     current_subject,                   /* where we currently are */
2393     start_offset,                      /* start offset in subject */
2394     offsets,                           /* offset vector */
2395     offsetcount,                       /* size of same */
2396     workspace,                         /* workspace vector */
2397     wscount,                           /* size of same */
2398     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2399     0,                                 /* function recurse level */
2400     0);                                /* regex recurse level */
2401
2402   /* Anything other than "no match" means we are done, always; otherwise, carry
2403   on only if not anchored. */
2404
2405   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2406
2407   /* Advance to the next subject character unless we are at the end of a line
2408   and firstline is set. */
2409
2410   if (firstline && IS_NEWLINE(current_subject)) break;
2411   current_subject++;
2412   if (utf8)
2413     {
2414     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2415       current_subject++;
2416     }
2417   if (current_subject > end_subject) break;
2418
2419   /* If we have just passed a CR and the newline option is CRLF or ANY, and we
2420   are now at a LF, advance the match position by one more character. */
2421
2422   if (current_subject[-1] == '\r' &&
2423        (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
2424        current_subject < end_subject &&
2425        *current_subject == '\n')
2426     current_subject++;
2427
2428   }   /* "Bumpalong" loop */
2429
2430 return PCRE_ERROR_NOMATCH;
2431 }
2432
2433 /* End of pcre_dfa_exec.c */