glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2007 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 #define NLBLOCK md             /* Block containing newline information */
  48 #define PSSTART start_subject  /* Field containing processed string start */
  49 #define PSEND   end_subject    /* Field containing processed string end */
  50
  51 #include "pcre_internal.h"
  52
  53
  54 /* For use to indent debugging output */
  55
  56 #define SP "                   "
  57
  58
  59
  60 /*************************************************
  61 *      Code parameters and static tables         *
  62 *************************************************/
  63
  64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  65 into others, under special conditions. A gap of 20 between the blocks should be
  66 enough. The resulting opcodes don't have to be less than 256 because they are
  67 never stored, so we push them well clear of the normal opcodes. */
  68
  69 #define OP_PROP_EXTRA       300
  70 #define OP_EXTUNI_EXTRA     320
  71 #define OP_ANYNL_EXTRA      340
  72 #define OP_HSPACE_EXTRA     360
  73 #define OP_VSPACE_EXTRA     380
  74
  75
  76 /* This table identifies those opcodes that are followed immediately by a
  77 character that is to be tested in some way. This makes is possible to
  78 centralize the loading of these characters. In the case of Type * etc, the
  79 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  80 small value. ***NOTE*** If the start of this table is modified, the two tables
  81 that follow must also be modified. */
  82
  83 static uschar coptable[] = {
  84   0,                             /* End                                    */
  85   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  86   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  87   0, 0,                          /* Any, Anybyte                           */
  88   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
  89   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  90   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  91   1,                             /* Char                                   */
  92   1,                             /* Charnc                                 */
  93   1,                             /* not                                    */
  94   /* Positive single-char repeats                                          */
  95   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  96   3, 3, 3,                       /* upto, minupto, exact                   */
  97   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
  98   /* Negative single-char repeats - only for chars < 256                   */
  99   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 100   3, 3, 3,                       /* NOT upto, minupto, exact               */
 101   1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */
 102   /* Positive type repeats                                                 */
 103   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 104   3, 3, 3,                       /* Type upto, minupto, exact              */
 105   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 106   /* Character class & ref repeats                                         */
 107   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 108   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 109   0,                             /* CLASS                                  */
 110   0,                             /* NCLASS                                 */
 111   0,                             /* XCLASS - variable length               */
 112   0,                             /* REF                                    */
 113   0,                             /* RECURSE                                */
 114   0,                             /* CALLOUT                                */
 115   0,                             /* Alt                                    */
 116   0,                             /* Ket                                    */
 117   0,                             /* KetRmax                                */
 118   0,                             /* KetRmin                                */
 119   0,                             /* Assert                                 */
 120   0,                             /* Assert not                             */
 121   0,                             /* Assert behind                          */
 122   0,                             /* Assert behind not                      */
 123   0,                             /* Reverse                                */
 124   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 125   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 126   0,                             /* CREF                                   */
 127   0,                             /* RREF                                   */
 128   0,                             /* DEF                                    */
 129   0, 0                           /* BRAZERO, BRAMINZERO                    */
 130 };
 131
 132 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 133 and \w */
 134
 135 static uschar toptable1[] = {
 136   0, 0, 0, 0, 0, 0,
 137   ctype_digit, ctype_digit,
 138   ctype_space, ctype_space,
 139   ctype_word,  ctype_word,
 140   0                               /* OP_ANY */
 141 };
 142
 143 static uschar toptable2[] = {
 144   0, 0, 0, 0, 0, 0,
 145   ctype_digit, 0,
 146   ctype_space, 0,
 147   ctype_word,  0,
 148   1                               /* OP_ANY */
 149 };
 150
 151
 152 /* Structure for holding data about a particular state, which is in effect the
 153 current data for an active path through the match tree. It must consist
 154 entirely of ints because the working vector we are passed, and which we put
 155 these structures in, is a vector of ints. */
 156
 157 typedef struct stateblock {
 158   int offset;                     /* Offset to opcode */
 159   int count;                      /* Count for repeats */
 160   int ims;                        /* ims flag bits */
 161   int data;                       /* Some use extra data */
 162 } stateblock;
 163
 164 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
 165
 166
 167 #ifdef DEBUG
 168 /*************************************************
 169 *             Print character string             *
 170 *************************************************/
 171
 172 /* Character string printing function for debugging.
 173
 174 Arguments:
 175   p            points to string
 176   length       number of bytes
 177   f            where to print
 178
 179 Returns:       nothing
 180 */
 181
 182 static void
 183 pchars(unsigned char *p, int length, FILE *f)
 184 {
 185 int c;
 186 while (length-- > 0)
 187   {
 188   if (isprint(c = *(p++)))
 189     fprintf(f, "%c", c);
 190   else
 191     fprintf(f, "\\x%02x", c);
 192   }
 193 }
 194 #endif
 195
 196
 197
 198 /*************************************************
 199 *    Execute a Regular Expression - DFA engine   *
 200 *************************************************/
 201
 202 /* This internal function applies a compiled pattern to a subject string,
 203 starting at a given point, using a DFA engine. This function is called from the
 204 external one, possibly multiple times if the pattern is not anchored. The
 205 function calls itself recursively for some kinds of subpattern.
 206
 207 Arguments:
 208   md                the match_data block with fixed information
 209   this_start_code   the opening bracket of this subexpression's code
 210   current_subject   where we currently are in the subject string
 211   start_offset      start offset in the subject string
 212   offsets           vector to contain the matching string offsets
 213   offsetcount       size of same
 214   workspace         vector of workspace
 215   wscount           size of same
 216   ims               the current ims flags
 217   rlevel            function call recursion level
 218   recursing         regex recursive call level
 219
 220 Returns:            > 0 => number of matches recorded in offsets
 221                     = 0 => offsets too small for all matches; longest kept
 222                      -1 => failed to match
 223                    < -1 => some kind of unexpected problem
 224
 225 The following macros are used for adding states to the two state vectors (one
 226 for the current character, one for the following character). */
 227
 228 #define ADD_ACTIVE(x,y) \
 229   if (active_count++ < wscount) \
 230     { \
 231     next_active_state->offset = (x); \
 232     next_active_state->count  = (y); \
 233     next_active_state->ims    = ims; \
 234     next_active_state++; \
 235     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 236     } \
 237   else return PCRE_ERROR_DFA_WSSIZE
 238
 239 #define ADD_ACTIVE_DATA(x,y,z) \
 240   if (active_count++ < wscount) \
 241     { \
 242     next_active_state->offset = (x); \
 243     next_active_state->count  = (y); \
 244     next_active_state->ims    = ims; \
 245     next_active_state->data   = (z); \
 246     next_active_state++; \
 247     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 248     } \
 249   else return PCRE_ERROR_DFA_WSSIZE
 250
 251 #define ADD_NEW(x,y) \
 252   if (new_count++ < wscount) \
 253     { \
 254     next_new_state->offset = (x); \
 255     next_new_state->count  = (y); \
 256     next_new_state->ims    = ims; \
 257     next_new_state++; \
 258     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 259     } \
 260   else return PCRE_ERROR_DFA_WSSIZE
 261
 262 #define ADD_NEW_DATA(x,y,z) \
 263   if (new_count++ < wscount) \
 264     { \
 265     next_new_state->offset = (x); \
 266     next_new_state->count  = (y); \
 267     next_new_state->ims    = ims; \
 268     next_new_state->data   = (z); \
 269     next_new_state++; \
 270     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 271     } \
 272   else return PCRE_ERROR_DFA_WSSIZE
 273
 274 /* And now, here is the code */
 275
 276 static int
 277 internal_dfa_exec(
 278   dfa_match_data *md,
 279   const uschar *this_start_code,
 280   const uschar *current_subject,
 281   int start_offset,
 282   int *offsets,
 283   int offsetcount,
 284   int *workspace,
 285   int wscount,
 286   int ims,
 287   int  rlevel,
 288   int  recursing)
 289 {
 290 stateblock *active_states, *new_states, *temp_states;
 291 stateblock *next_active_state, *next_new_state;
 292
 293 const uschar *ctypes, *lcc, *fcc;
 294 const uschar *ptr;
 295 const uschar *end_code, *first_op;
 296
 297 int active_count, new_count, match_count;
 298
 299 /* Some fields in the md block are frequently referenced, so we load them into
 300 independent variables in the hope that this will perform better. */
 301
 302 const uschar *start_subject = md->start_subject;
 303 const uschar *end_subject = md->end_subject;
 304 const uschar *start_code = md->start_code;
 305
 306 #ifdef SUPPORT_UTF8
 307 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 308 #else
 309 BOOL utf8 = FALSE;
 310 #endif
 311
 312 rlevel++;
 313 offsetcount &= (-2);
 314
 315 wscount -= 2;
 316 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 317           (2 * INTS_PER_STATEBLOCK);
 318
 319 DPRINTF(("\n%.*s---------------------\n"
 320   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 321   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 322
 323 ctypes = md->tables + ctypes_offset;
 324 lcc = md->tables + lcc_offset;
 325 fcc = md->tables + fcc_offset;
 326
 327 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 328
 329 active_states = (stateblock *)(workspace + 2);
 330 next_new_state = new_states = active_states + wscount;
 331 new_count = 0;
 332
 333 first_op = this_start_code + 1 + LINK_SIZE +
 334   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 335
 336 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 337 the alternative states onto the list, and find out where the end is. This
 338 makes it possible to use this function recursively, when we want to stop at a
 339 matching internal ket rather than at the end.
 340
 341 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 342 a backward assertion. In that case, we have to find out the maximum amount to
 343 move back, and set up each alternative appropriately. */
 344
 345 if (*first_op == OP_REVERSE)
 346   {
 347   int max_back = 0;
 348   int gone_back;
 349
 350   end_code = this_start_code;
 351   do
 352     {
 353     int back = GET(end_code, 2+LINK_SIZE);
 354     if (back > max_back) max_back = back;
 355     end_code += GET(end_code, 1);
 356     }
 357   while (*end_code == OP_ALT);
 358
 359   /* If we can't go back the amount required for the longest lookbehind
 360   pattern, go back as far as we can; some alternatives may still be viable. */
 361
 362 #ifdef SUPPORT_UTF8
 363   /* In character mode we have to step back character by character */
 364
 365   if (utf8)
 366     {
 367     for (gone_back = 0; gone_back < max_back; gone_back++)
 368       {
 369       if (current_subject <= start_subject) break;
 370       current_subject--;
 371       while (current_subject > start_subject &&
 372              (*current_subject & 0xc0) == 0x80)
 373         current_subject--;
 374       }
 375     }
 376   else
 377 #endif
 378
 379   /* In byte-mode we can do this quickly. */
 380
 381     {
 382     gone_back = (current_subject - max_back < start_subject)?
 383       current_subject - start_subject : max_back;
 384     current_subject -= gone_back;
 385     }
 386
 387   /* Now we can process the individual branches. */
 388
 389   end_code = this_start_code;
 390   do
 391     {
 392     int back = GET(end_code, 2+LINK_SIZE);
 393     if (back <= gone_back)
 394       {
 395       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 396       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 397       }
 398     end_code += GET(end_code, 1);
 399     }
 400   while (*end_code == OP_ALT);
 401  }
 402
 403 /* This is the code for a "normal" subpattern (not a backward assertion). The
 404 start of a whole pattern is always one of these. If we are at the top level,
 405 we may be asked to restart matching from the same point that we reached for a
 406 previous partial match. We still have to scan through the top-level branches to
 407 find the end state. */
 408
 409 else
 410   {
 411   end_code = this_start_code;
 412
 413   /* Restarting */
 414
 415   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 416     {
 417     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 418     new_count = workspace[1];
 419     if (!workspace[0])
 420       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 421     }
 422
 423   /* Not restarting */
 424
 425   else
 426     {
 427     int length = 1 + LINK_SIZE +
 428       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 429     do
 430       {
 431       ADD_NEW(end_code - start_code + length, 0);
 432       end_code += GET(end_code, 1);
 433       length = 1 + LINK_SIZE;
 434       }
 435     while (*end_code == OP_ALT);
 436     }
 437   }
 438
 439 workspace[0] = 0;    /* Bit indicating which vector is current */
 440
 441 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 442
 443 /* Loop for scanning the subject */
 444
 445 ptr = current_subject;
 446 for (;;)
 447   {
 448   int i, j;
 449   int clen, dlen;
 450   unsigned int c, d;
 451
 452   /* Make the new state list into the active state list and empty the
 453   new state list. */
 454
 455   temp_states = active_states;
 456   active_states = new_states;
 457   new_states = temp_states;
 458   active_count = new_count;
 459   new_count = 0;
 460
 461   workspace[0] ^= 1;              /* Remember for the restarting feature */
 462   workspace[1] = active_count;
 463
 464 #ifdef DEBUG
 465   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 466   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 467   printf("\"\n");
 468
 469   printf("%.*sActive states: ", rlevel*2-2, SP);
 470   for (i = 0; i < active_count; i++)
 471     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 472   printf("\n");
 473 #endif
 474
 475   /* Set the pointers for adding new states */
 476
 477   next_active_state = active_states + active_count;
 478   next_new_state = new_states;
 479
 480   /* Load the current character from the subject outside the loop, as many
 481   different states may want to look at it, and we assume that at least one
 482   will. */
 483
 484   if (ptr < end_subject)
 485     {
 486     clen = 1;        /* Number of bytes in the character */
 487 #ifdef SUPPORT_UTF8
 488     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 489 #endif  /* SUPPORT_UTF8 */
 490     c = *ptr;
 491     }
 492   else
 493     {
 494     clen = 0;        /* This indicates the end of the subject */
 495     c = NOTACHAR;    /* This value should never actually be used */
 496     }
 497
 498   /* Scan up the active states and act on each one. The result of an action
 499   may be to add more states to the currently active list (e.g. on hitting a
 500   parenthesis) or it may be to put states on the new list, for considering
 501   when we move the character pointer on. */
 502
 503   for (i = 0; i < active_count; i++)
 504     {
 505     stateblock *current_state = active_states + i;
 506     const uschar *code;
 507     int state_offset = current_state->offset;
 508     int count, codevalue;
 509 #ifdef SUPPORT_UCP
 510     int chartype, script;
 511 #endif
 512
 513 #ifdef DEBUG
 514     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 515     if (clen == 0) printf("EOL\n");
 516       else if (c > 32 && c < 127) printf("'%c'\n", c);
 517         else printf("0x%02x\n", c);
 518 #endif
 519
 520     /* This variable is referred to implicitly in the ADD_xxx macros. */
 521
 522     ims = current_state->ims;
 523
 524     /* A negative offset is a special case meaning "hold off going to this
 525     (negated) state until the number of characters in the data field have
 526     been skipped". */
 527
 528     if (state_offset < 0)
 529       {
 530       if (current_state->data > 0)
 531         {
 532         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 533         ADD_NEW_DATA(state_offset, current_state->count,
 534           current_state->data - 1);
 535         continue;
 536         }
 537       else
 538         {
 539         current_state->offset = state_offset = -state_offset;
 540         }
 541       }
 542
 543     /* Check for a duplicate state with the same count, and skip if found. */
 544
 545     for (j = 0; j < i; j++)
 546       {
 547       if (active_states[j].offset == state_offset &&
 548           active_states[j].count == current_state->count)
 549         {
 550         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 551         goto NEXT_ACTIVE_STATE;
 552         }
 553       }
 554
 555     /* The state offset is the offset to the opcode */
 556
 557     code = start_code + state_offset;
 558     codevalue = *code;
 559
 560     /* If this opcode is followed by an inline character, load it. It is
 561     tempting to test for the presence of a subject character here, but that
 562     is wrong, because sometimes zero repetitions of the subject are
 563     permitted.
 564
 565     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 566     argument that is not a data character - but is always one byte long. We
 567     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
 568     this case. To keep the other cases fast, convert these ones to new opcodes.
 569     */
 570
 571     if (coptable[codevalue] > 0)
 572       {
 573       dlen = 1;
 574 #ifdef SUPPORT_UTF8
 575       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 576 #endif  /* SUPPORT_UTF8 */
 577       d = code[coptable[codevalue]];
 578       if (codevalue >= OP_TYPESTAR)
 579         {
 580         switch(d)
 581           {
 582           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 583           case OP_NOTPROP:
 584           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 585           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 586           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 587           case OP_NOT_HSPACE:
 588           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 589           case OP_NOT_VSPACE:
 590           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 591           default: break;
 592           }
 593         }
 594       }
 595     else
 596       {
 597       dlen = 0;         /* Not strictly necessary, but compilers moan */
 598       d = NOTACHAR;     /* if these variables are not set. */
 599       }
 600
 601
 602     /* Now process the individual opcodes */
 603
 604     switch (codevalue)
 605       {
 606
 607 /* ========================================================================== */
 608       /* Reached a closing bracket. If not at the end of the pattern, carry
 609       on with the next opcode. Otherwise, unless we have an empty string and
 610       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
 611       matches so we always have the longest first. */
 612
 613       case OP_KET:
 614       case OP_KETRMIN:
 615       case OP_KETRMAX:
 616       if (code != end_code)
 617         {
 618         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 619         if (codevalue != OP_KET)
 620           {
 621           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 622           }
 623         }
 624       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
 625         {
 626         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 627           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 628             match_count = 0;
 629         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 630         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 631         if (offsetcount >= 2)
 632           {
 633           offsets[0] = current_subject - start_subject;
 634           offsets[1] = ptr - start_subject;
 635           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 636             offsets[1] - offsets[0], current_subject));
 637           }
 638         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 639           {
 640           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 641             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 642             match_count, rlevel*2-2, SP));
 643           return match_count;
 644           }
 645         }
 646       break;
 647
 648 /* ========================================================================== */
 649       /* These opcodes add to the current list of states without looking
 650       at the current character. */
 651
 652       /*-----------------------------------------------------------------*/
 653       case OP_ALT:
 654       do { code += GET(code, 1); } while (*code == OP_ALT);
 655       ADD_ACTIVE(code - start_code, 0);
 656       break;
 657
 658       /*-----------------------------------------------------------------*/
 659       case OP_BRA:
 660       case OP_SBRA:
 661       do
 662         {
 663         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 664         code += GET(code, 1);
 665         }
 666       while (*code == OP_ALT);
 667       break;
 668
 669       /*-----------------------------------------------------------------*/
 670       case OP_CBRA:
 671       case OP_SCBRA:
 672       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 673       code += GET(code, 1);
 674       while (*code == OP_ALT)
 675         {
 676         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 677         code += GET(code, 1);
 678         }
 679       break;
 680
 681       /*-----------------------------------------------------------------*/
 682       case OP_BRAZERO:
 683       case OP_BRAMINZERO:
 684       ADD_ACTIVE(state_offset + 1, 0);
 685       code += 1 + GET(code, 2);
 686       while (*code == OP_ALT) code += GET(code, 1);
 687       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 688       break;
 689
 690       /*-----------------------------------------------------------------*/
 691       case OP_CIRC:
 692       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 693           ((ims & PCRE_MULTILINE) != 0 &&
 694             ptr != end_subject &&
 695             WAS_NEWLINE(ptr)))
 696         { ADD_ACTIVE(state_offset + 1, 0); }
 697       break;
 698
 699       /*-----------------------------------------------------------------*/
 700       case OP_EOD:
 701       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 702       break;
 703
 704       /*-----------------------------------------------------------------*/
 705       case OP_OPT:
 706       ims = code[1];
 707       ADD_ACTIVE(state_offset + 2, 0);
 708       break;
 709
 710       /*-----------------------------------------------------------------*/
 711       case OP_SOD:
 712       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 713       break;
 714
 715       /*-----------------------------------------------------------------*/
 716       case OP_SOM:
 717       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 718       break;
 719
 720
 721 /* ========================================================================== */
 722       /* These opcodes inspect the next subject character, and sometimes
 723       the previous one as well, but do not have an argument. The variable
 724       clen contains the length of the current character and is zero if we are
 725       at the end of the subject. */
 726
 727       /*-----------------------------------------------------------------*/
 728       case OP_ANY:
 729       if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
 730         { ADD_NEW(state_offset + 1, 0); }
 731       break;
 732
 733       /*-----------------------------------------------------------------*/
 734       case OP_EODN:
 735       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 736         { ADD_ACTIVE(state_offset + 1, 0); }
 737       break;
 738
 739       /*-----------------------------------------------------------------*/
 740       case OP_DOLL:
 741       if ((md->moptions & PCRE_NOTEOL) == 0)
 742         {
 743         if (clen == 0 ||
 744             (IS_NEWLINE(ptr) &&
 745                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 746             ))
 747           { ADD_ACTIVE(state_offset + 1, 0); }
 748         }
 749       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 750         { ADD_ACTIVE(state_offset + 1, 0); }
 751       break;
 752
 753       /*-----------------------------------------------------------------*/
 754
 755       case OP_DIGIT:
 756       case OP_WHITESPACE:
 757       case OP_WORDCHAR:
 758       if (clen > 0 && c < 256 &&
 759             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 760         { ADD_NEW(state_offset + 1, 0); }
 761       break;
 762
 763       /*-----------------------------------------------------------------*/
 764       case OP_NOT_DIGIT:
 765       case OP_NOT_WHITESPACE:
 766       case OP_NOT_WORDCHAR:
 767       if (clen > 0 && (c >= 256 ||
 768             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 769         { ADD_NEW(state_offset + 1, 0); }
 770       break;
 771
 772       /*-----------------------------------------------------------------*/
 773       case OP_WORD_BOUNDARY:
 774       case OP_NOT_WORD_BOUNDARY:
 775         {
 776         int left_word, right_word;
 777
 778         if (ptr > start_subject)
 779           {
 780           const uschar *temp = ptr - 1;
 781 #ifdef SUPPORT_UTF8
 782           if (utf8) BACKCHAR(temp);
 783 #endif
 784           GETCHARTEST(d, temp);
 785           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 786           }
 787         else left_word = 0;
 788
 789         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 790           else right_word = 0;
 791
 792         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 793           { ADD_ACTIVE(state_offset + 1, 0); }
 794         }
 795       break;
 796
 797
 798       /*-----------------------------------------------------------------*/
 799       /* Check the next character by Unicode property. We will get here only
 800       if the support is in the binary; otherwise a compile-time error occurs.
 801       */
 802
 803 #ifdef SUPPORT_UCP
 804       case OP_PROP:
 805       case OP_NOTPROP:
 806       if (clen > 0)
 807         {
 808         BOOL OK;
 809         int category = _pcre_ucp_findprop(c, &chartype, &script);
 810         switch(code[1])
 811           {
 812           case PT_ANY:
 813           OK = TRUE;
 814           break;
 815
 816           case PT_LAMP:
 817           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 818           break;
 819
 820           case PT_GC:
 821           OK = category == code[2];
 822           break;
 823
 824           case PT_PC:
 825           OK = chartype == code[2];
 826           break;
 827
 828           case PT_SC:
 829           OK = script == code[2];
 830           break;
 831
 832           /* Should never occur, but keep compilers from grumbling. */
 833
 834           default:
 835           OK = codevalue != OP_PROP;
 836           break;
 837           }
 838
 839         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 840         }
 841       break;
 842 #endif
 843
 844
 845
 846 /* ========================================================================== */
 847       /* These opcodes likewise inspect the subject character, but have an
 848       argument that is not a data character. It is one of these opcodes:
 849       OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 850       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 851
 852       case OP_TYPEPLUS:
 853       case OP_TYPEMINPLUS:
 854       case OP_TYPEPOSPLUS:
 855       count = current_state->count;  /* Already matched */
 856       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 857       if (clen > 0)
 858         {
 859         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 860             (c < 256 &&
 861               (d != OP_ANY ||
 862                (ims & PCRE_DOTALL) != 0 ||
 863                !IS_NEWLINE(ptr)
 864               ) &&
 865               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 866           {
 867           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 868             {
 869             active_count--;            /* Remove non-match possibility */
 870             next_active_state--;
 871             }
 872           count++;
 873           ADD_NEW(state_offset, count);
 874           }
 875         }
 876       break;
 877
 878       /*-----------------------------------------------------------------*/
 879       case OP_TYPEQUERY:
 880       case OP_TYPEMINQUERY:
 881       case OP_TYPEPOSQUERY:
 882       ADD_ACTIVE(state_offset + 2, 0);
 883       if (clen > 0)
 884         {
 885         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 886             (c < 256 &&
 887               (d != OP_ANY ||
 888                (ims & PCRE_DOTALL) != 0 ||
 889                !IS_NEWLINE(ptr)
 890               ) &&
 891               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 892           {
 893           if (codevalue == OP_TYPEPOSQUERY)
 894             {
 895             active_count--;            /* Remove non-match possibility */
 896             next_active_state--;
 897             }
 898           ADD_NEW(state_offset + 2, 0);
 899           }
 900         }
 901       break;
 902
 903       /*-----------------------------------------------------------------*/
 904       case OP_TYPESTAR:
 905       case OP_TYPEMINSTAR:
 906       case OP_TYPEPOSSTAR:
 907       ADD_ACTIVE(state_offset + 2, 0);
 908       if (clen > 0)
 909         {
 910         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 911             (c < 256 &&
 912               (d != OP_ANY ||
 913                (ims & PCRE_DOTALL) != 0 ||
 914                !IS_NEWLINE(ptr)
 915               ) &&
 916               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 917           {
 918           if (codevalue == OP_TYPEPOSSTAR)
 919             {
 920             active_count--;            /* Remove non-match possibility */
 921             next_active_state--;
 922             }
 923           ADD_NEW(state_offset, 0);
 924           }
 925         }
 926       break;
 927
 928       /*-----------------------------------------------------------------*/
 929       case OP_TYPEEXACT:
 930       count = current_state->count;  /* Number already matched */
 931       if (clen > 0)
 932         {
 933         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 934             (c < 256 &&
 935               (d != OP_ANY ||
 936                (ims & PCRE_DOTALL) != 0 ||
 937                !IS_NEWLINE(ptr)
 938               ) &&
 939               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 940           {
 941           if (++count >= GET2(code, 1))
 942             { ADD_NEW(state_offset + 4, 0); }
 943           else
 944             { ADD_NEW(state_offset, count); }
 945           }
 946         }
 947       break;
 948
 949       /*-----------------------------------------------------------------*/
 950       case OP_TYPEUPTO:
 951       case OP_TYPEMINUPTO:
 952       case OP_TYPEPOSUPTO:
 953       ADD_ACTIVE(state_offset + 4, 0);
 954       count = current_state->count;  /* Number already matched */
 955       if (clen > 0)
 956         {
 957         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 958             (c < 256 &&
 959               (d != OP_ANY ||
 960                (ims & PCRE_DOTALL) != 0 ||
 961                !IS_NEWLINE(ptr)
 962               ) &&
 963               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 964           {
 965           if (codevalue == OP_TYPEPOSUPTO)
 966             {
 967             active_count--;           /* Remove non-match possibility */
 968             next_active_state--;
 969             }
 970           if (++count >= GET2(code, 1))
 971             { ADD_NEW(state_offset + 4, 0); }
 972           else
 973             { ADD_NEW(state_offset, count); }
 974           }
 975         }
 976       break;
 977
 978 /* ========================================================================== */
 979       /* These are virtual opcodes that are used when something like
 980       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
 981       OP_VSPACE/OP_HSPACE family as its argument. It keeps the code above fast
 982       for the other cases. The argument is in the d variable. */
 983
 984 #ifdef SUPPORT_UCP
 985       case OP_PROP_EXTRA + OP_TYPEPLUS:
 986       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 987       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 988       count = current_state->count;           /* Already matched */
 989       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 990       if (clen > 0)
 991         {
 992         BOOL OK;
 993         int category = _pcre_ucp_findprop(c, &chartype, &script);
 994         switch(code[2])
 995           {
 996           case PT_ANY:
 997           OK = TRUE;
 998           break;
 999
1000           case PT_LAMP:
1001           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1002           break;
1003
1004           case PT_GC:
1005           OK = category == code[3];
1006           break;
1007
1008           case PT_PC:
1009           OK = chartype == code[3];
1010           break;
1011
1012           case PT_SC:
1013           OK = script == code[3];
1014           break;
1015
1016           /* Should never occur, but keep compilers from grumbling. */
1017
1018           default:
1019           OK = codevalue != OP_PROP;
1020           break;
1021           }
1022
1023         if (OK == (d == OP_PROP))
1024           {
1025           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1026             {
1027             active_count--;           /* Remove non-match possibility */
1028             next_active_state--;
1029             }
1030           count++;
1031           ADD_NEW(state_offset, count);
1032           }
1033         }
1034       break;
1035
1036       /*-----------------------------------------------------------------*/
1037       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1038       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1039       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1040       count = current_state->count;  /* Already matched */
1041       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1042       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1043         {
1044         const uschar *nptr = ptr + clen;
1045         int ncount = 0;
1046         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1047           {
1048           active_count--;           /* Remove non-match possibility */
1049           next_active_state--;
1050           }
1051         while (nptr < end_subject)
1052           {
1053           int nd;
1054           int ndlen = 1;
1055           GETCHARLEN(nd, nptr, ndlen);
1056           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1057           ncount++;
1058           nptr += ndlen;
1059           }
1060         count++;
1061         ADD_NEW_DATA(-state_offset, count, ncount);
1062         }
1063       break;
1064 #endif
1065
1066       /*-----------------------------------------------------------------*/
1067       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1068       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1069       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1070       count = current_state->count;  /* Already matched */
1071       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1072       if (clen > 0)
1073         {
1074         int ncount = 0;
1075         switch (c)
1076           {
1077           case 0x000d:
1078           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1079           /* Fall through */
1080           case 0x000a:
1081           case 0x000b:
1082           case 0x000c:
1083           case 0x0085:
1084           case 0x2028:
1085           case 0x2029:
1086           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1087             {
1088             active_count--;           /* Remove non-match possibility */
1089             next_active_state--;
1090             }
1091           count++;
1092           ADD_NEW_DATA(-state_offset, count, ncount);
1093           break;
1094           default:
1095           break;
1096           }
1097         }
1098       break;
1099
1100       /*-----------------------------------------------------------------*/
1101       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1102       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1103       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1104       count = current_state->count;  /* Already matched */
1105       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1106       if (clen > 0)
1107         {
1108         BOOL OK;
1109         switch (c)
1110           {
1111           case 0x000a:
1112           case 0x000b:
1113           case 0x000c:
1114           case 0x000d:
1115           case 0x0085:
1116           case 0x2028:
1117           case 0x2029:
1118           OK = TRUE;
1119           break;
1120
1121           default:
1122           OK = FALSE;
1123           break;
1124           }
1125
1126         if (OK == (d == OP_VSPACE))
1127           {
1128           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1129             {
1130             active_count--;           /* Remove non-match possibility */
1131             next_active_state--;
1132             }
1133           count++;
1134           ADD_NEW_DATA(-state_offset, count, 0);
1135           }
1136         }
1137       break;
1138
1139       /*-----------------------------------------------------------------*/
1140       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1141       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1142       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1143       count = current_state->count;  /* Already matched */
1144       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1145       if (clen > 0)
1146         {
1147         BOOL OK;
1148         switch (c)
1149           {
1150           case 0x09:      /* HT */
1151           case 0x20:      /* SPACE */
1152           case 0xa0:      /* NBSP */
1153           case 0x1680:    /* OGHAM SPACE MARK */
1154           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1155           case 0x2000:    /* EN QUAD */
1156           case 0x2001:    /* EM QUAD */
1157           case 0x2002:    /* EN SPACE */
1158           case 0x2003:    /* EM SPACE */
1159           case 0x2004:    /* THREE-PER-EM SPACE */
1160           case 0x2005:    /* FOUR-PER-EM SPACE */
1161           case 0x2006:    /* SIX-PER-EM SPACE */
1162           case 0x2007:    /* FIGURE SPACE */
1163           case 0x2008:    /* PUNCTUATION SPACE */
1164           case 0x2009:    /* THIN SPACE */
1165           case 0x200A:    /* HAIR SPACE */
1166           case 0x202f:    /* NARROW NO-BREAK SPACE */
1167           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1168           case 0x3000:    /* IDEOGRAPHIC SPACE */
1169           OK = TRUE;
1170           break;
1171
1172           default:
1173           OK = FALSE;
1174           break;
1175           }
1176
1177         if (OK == (d == OP_HSPACE))
1178           {
1179           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1180             {
1181             active_count--;           /* Remove non-match possibility */
1182             next_active_state--;
1183             }
1184           count++;
1185           ADD_NEW_DATA(-state_offset, count, 0);
1186           }
1187         }
1188       break;
1189
1190       /*-----------------------------------------------------------------*/
1191 #ifdef SUPPORT_UCP
1192       case OP_PROP_EXTRA + OP_TYPEQUERY:
1193       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1194       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1195       count = 4;
1196       goto QS1;
1197
1198       case OP_PROP_EXTRA + OP_TYPESTAR:
1199       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1200       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1201       count = 0;
1202
1203       QS1:
1204
1205       ADD_ACTIVE(state_offset + 4, 0);
1206       if (clen > 0)
1207         {
1208         BOOL OK;
1209         int category = _pcre_ucp_findprop(c, &chartype, &script);
1210         switch(code[2])
1211           {
1212           case PT_ANY:
1213           OK = TRUE;
1214           break;
1215
1216           case PT_LAMP:
1217           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1218           break;
1219
1220           case PT_GC:
1221           OK = category == code[3];
1222           break;
1223
1224           case PT_PC:
1225           OK = chartype == code[3];
1226           break;
1227
1228           case PT_SC:
1229           OK = script == code[3];
1230           break;
1231
1232           /* Should never occur, but keep compilers from grumbling. */
1233
1234           default:
1235           OK = codevalue != OP_PROP;
1236           break;
1237           }
1238
1239         if (OK == (d == OP_PROP))
1240           {
1241           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1242               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1243             {
1244             active_count--;           /* Remove non-match possibility */
1245             next_active_state--;
1246             }
1247           ADD_NEW(state_offset + count, 0);
1248           }
1249         }
1250       break;
1251
1252       /*-----------------------------------------------------------------*/
1253       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1254       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1255       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1256       count = 2;
1257       goto QS2;
1258
1259       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1260       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1261       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1262       count = 0;
1263
1264       QS2:
1265
1266       ADD_ACTIVE(state_offset + 2, 0);
1267       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1268         {
1269         const uschar *nptr = ptr + clen;
1270         int ncount = 0;
1271         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1272             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1273           {
1274           active_count--;           /* Remove non-match possibility */
1275           next_active_state--;
1276           }
1277         while (nptr < end_subject)
1278           {
1279           int nd;
1280           int ndlen = 1;
1281           GETCHARLEN(nd, nptr, ndlen);
1282           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1283           ncount++;
1284           nptr += ndlen;
1285           }
1286         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1287         }
1288       break;
1289 #endif
1290
1291       /*-----------------------------------------------------------------*/
1292       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1293       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1294       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1295       count = 2;
1296       goto QS3;
1297
1298       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1299       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1300       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1301       count = 0;
1302
1303       QS3:
1304       ADD_ACTIVE(state_offset + 2, 0);
1305       if (clen > 0)
1306         {
1307         int ncount = 0;
1308         switch (c)
1309           {
1310           case 0x000d:
1311           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1312           /* Fall through */
1313           case 0x000a:
1314           case 0x000b:
1315           case 0x000c:
1316           case 0x0085:
1317           case 0x2028:
1318           case 0x2029:
1319           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1320               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1321             {
1322             active_count--;           /* Remove non-match possibility */
1323             next_active_state--;
1324             }
1325           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1326           break;
1327           default:
1328           break;
1329           }
1330         }
1331       break;
1332
1333       /*-----------------------------------------------------------------*/
1334       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1335       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1336       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1337       count = 2;
1338       goto QS4;
1339
1340       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1341       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1342       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1343       count = 0;
1344
1345       QS4:
1346       ADD_ACTIVE(state_offset + 2, 0);
1347       if (clen > 0)
1348         {
1349         BOOL OK;
1350         switch (c)
1351           {
1352           case 0x000a:
1353           case 0x000b:
1354           case 0x000c:
1355           case 0x000d:
1356           case 0x0085:
1357           case 0x2028:
1358           case 0x2029:
1359           OK = TRUE;
1360           break;
1361
1362           default:
1363           OK = FALSE;
1364           break;
1365           }
1366         if (OK == (d == OP_VSPACE))
1367           {
1368           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1369               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1370             {
1371             active_count--;           /* Remove non-match possibility */
1372             next_active_state--;
1373             }
1374           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1375           }
1376         }
1377       break;
1378
1379       /*-----------------------------------------------------------------*/
1380       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1381       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1382       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1383       count = 2;
1384       goto QS5;
1385
1386       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1387       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1388       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1389       count = 0;
1390
1391       QS5:
1392       ADD_ACTIVE(state_offset + 2, 0);
1393       if (clen > 0)
1394         {
1395         BOOL OK;
1396         switch (c)
1397           {
1398           case 0x09:      /* HT */
1399           case 0x20:      /* SPACE */
1400           case 0xa0:      /* NBSP */
1401           case 0x1680:    /* OGHAM SPACE MARK */
1402           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1403           case 0x2000:    /* EN QUAD */
1404           case 0x2001:    /* EM QUAD */
1405           case 0x2002:    /* EN SPACE */
1406           case 0x2003:    /* EM SPACE */
1407           case 0x2004:    /* THREE-PER-EM SPACE */
1408           case 0x2005:    /* FOUR-PER-EM SPACE */
1409           case 0x2006:    /* SIX-PER-EM SPACE */
1410           case 0x2007:    /* FIGURE SPACE */
1411           case 0x2008:    /* PUNCTUATION SPACE */
1412           case 0x2009:    /* THIN SPACE */
1413           case 0x200A:    /* HAIR SPACE */
1414           case 0x202f:    /* NARROW NO-BREAK SPACE */
1415           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1416           case 0x3000:    /* IDEOGRAPHIC SPACE */
1417           OK = TRUE;
1418           break;
1419
1420           default:
1421           OK = FALSE;
1422           break;
1423           }
1424
1425         if (OK == (d == OP_HSPACE))
1426           {
1427           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1428               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1429             {
1430             active_count--;           /* Remove non-match possibility */
1431             next_active_state--;
1432             }
1433           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1434           }
1435         }
1436       break;
1437
1438       /*-----------------------------------------------------------------*/
1439 #ifdef SUPPORT_UCP
1440       case OP_PROP_EXTRA + OP_TYPEEXACT:
1441       case OP_PROP_EXTRA + OP_TYPEUPTO:
1442       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1443       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1444       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1445         { ADD_ACTIVE(state_offset + 6, 0); }
1446       count = current_state->count;  /* Number already matched */
1447       if (clen > 0)
1448         {
1449         BOOL OK;
1450         int category = _pcre_ucp_findprop(c, &chartype, &script);
1451         switch(code[4])
1452           {
1453           case PT_ANY:
1454           OK = TRUE;
1455           break;
1456
1457           case PT_LAMP:
1458           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1459           break;
1460
1461           case PT_GC:
1462           OK = category == code[5];
1463           break;
1464
1465           case PT_PC:
1466           OK = chartype == code[5];
1467           break;
1468
1469           case PT_SC:
1470           OK = script == code[5];
1471           break;
1472
1473           /* Should never occur, but keep compilers from grumbling. */
1474
1475           default:
1476           OK = codevalue != OP_PROP;
1477           break;
1478           }
1479
1480         if (OK == (d == OP_PROP))
1481           {
1482           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1483             {
1484             active_count--;           /* Remove non-match possibility */
1485             next_active_state--;
1486             }
1487           if (++count >= GET2(code, 1))
1488             { ADD_NEW(state_offset + 6, 0); }
1489           else
1490             { ADD_NEW(state_offset, count); }
1491           }
1492         }
1493       break;
1494
1495       /*-----------------------------------------------------------------*/
1496       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1497       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1498       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1499       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1500       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1501         { ADD_ACTIVE(state_offset + 4, 0); }
1502       count = current_state->count;  /* Number already matched */
1503       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1504         {
1505         const uschar *nptr = ptr + clen;
1506         int ncount = 0;
1507         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1508           {
1509           active_count--;           /* Remove non-match possibility */
1510           next_active_state--;
1511           }
1512         while (nptr < end_subject)
1513           {
1514           int nd;
1515           int ndlen = 1;
1516           GETCHARLEN(nd, nptr, ndlen);
1517           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1518           ncount++;
1519           nptr += ndlen;
1520           }
1521         if (++count >= GET2(code, 1))
1522           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1523         else
1524           { ADD_NEW_DATA(-state_offset, count, ncount); }
1525         }
1526       break;
1527 #endif
1528
1529       /*-----------------------------------------------------------------*/
1530       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1531       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1532       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1533       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1534       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1535         { ADD_ACTIVE(state_offset + 4, 0); }
1536       count = current_state->count;  /* Number already matched */
1537       if (clen > 0)
1538         {
1539         int ncount = 0;
1540         switch (c)
1541           {
1542           case 0x000d:
1543           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1544           /* Fall through */
1545           case 0x000a:
1546           case 0x000b:
1547           case 0x000c:
1548           case 0x0085:
1549           case 0x2028:
1550           case 0x2029:
1551           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1552             {
1553             active_count--;           /* Remove non-match possibility */
1554             next_active_state--;
1555             }
1556           if (++count >= GET2(code, 1))
1557             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1558           else
1559             { ADD_NEW_DATA(-state_offset, count, ncount); }
1560           break;
1561           default:
1562           break;
1563           }
1564         }
1565       break;
1566
1567       /*-----------------------------------------------------------------*/
1568       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1569       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1570       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1571       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1572       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1573         { ADD_ACTIVE(state_offset + 4, 0); }
1574       count = current_state->count;  /* Number already matched */
1575       if (clen > 0)
1576         {
1577         BOOL OK;
1578         switch (c)
1579           {
1580           case 0x000a:
1581           case 0x000b:
1582           case 0x000c:
1583           case 0x000d:
1584           case 0x0085:
1585           case 0x2028:
1586           case 0x2029:
1587           OK = TRUE;
1588           break;
1589
1590           default:
1591           OK = FALSE;
1592           }
1593
1594         if (OK == (d == OP_VSPACE))
1595           {
1596           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1597             {
1598             active_count--;           /* Remove non-match possibility */
1599             next_active_state--;
1600             }
1601           if (++count >= GET2(code, 1))
1602             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1603           else
1604             { ADD_NEW_DATA(-state_offset, count, 0); }
1605           }
1606         }
1607       break;
1608
1609       /*-----------------------------------------------------------------*/
1610       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1611       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1612       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1613       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1614       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1615         { ADD_ACTIVE(state_offset + 4, 0); }
1616       count = current_state->count;  /* Number already matched */
1617       if (clen > 0)
1618         {
1619         BOOL OK;
1620         switch (c)
1621           {
1622           case 0x09:      /* HT */
1623           case 0x20:      /* SPACE */
1624           case 0xa0:      /* NBSP */
1625           case 0x1680:    /* OGHAM SPACE MARK */
1626           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1627           case 0x2000:    /* EN QUAD */
1628           case 0x2001:    /* EM QUAD */
1629           case 0x2002:    /* EN SPACE */
1630           case 0x2003:    /* EM SPACE */
1631           case 0x2004:    /* THREE-PER-EM SPACE */
1632           case 0x2005:    /* FOUR-PER-EM SPACE */
1633           case 0x2006:    /* SIX-PER-EM SPACE */
1634           case 0x2007:    /* FIGURE SPACE */
1635           case 0x2008:    /* PUNCTUATION SPACE */
1636           case 0x2009:    /* THIN SPACE */
1637           case 0x200A:    /* HAIR SPACE */
1638           case 0x202f:    /* NARROW NO-BREAK SPACE */
1639           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1640           case 0x3000:    /* IDEOGRAPHIC SPACE */
1641           OK = TRUE;
1642           break;
1643
1644           default:
1645           OK = FALSE;
1646           break;
1647           }
1648
1649         if (OK == (d == OP_HSPACE))
1650           {
1651           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1652             {
1653             active_count--;           /* Remove non-match possibility */
1654             next_active_state--;
1655             }
1656           if (++count >= GET2(code, 1))
1657             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1658           else
1659             { ADD_NEW_DATA(-state_offset, count, 0); }
1660           }
1661         }
1662       break;
1663
1664 /* ========================================================================== */
1665       /* These opcodes are followed by a character that is usually compared
1666       to the current subject character; it is loaded into d. We still get
1667       here even if there is no subject character, because in some cases zero
1668       repetitions are permitted. */
1669
1670       /*-----------------------------------------------------------------*/
1671       case OP_CHAR:
1672       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1673       break;
1674
1675       /*-----------------------------------------------------------------*/
1676       case OP_CHARNC:
1677       if (clen == 0) break;
1678
1679 #ifdef SUPPORT_UTF8
1680       if (utf8)
1681         {
1682         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1683           {
1684           unsigned int othercase;
1685           if (c < 128) othercase = fcc[c]; else
1686
1687           /* If we have Unicode property support, we can use it to test the
1688           other case of the character. */
1689
1690 #ifdef SUPPORT_UCP
1691           othercase = _pcre_ucp_othercase(c);
1692 #else
1693           othercase = NOTACHAR;
1694 #endif
1695
1696           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1697           }
1698         }
1699       else
1700 #endif  /* SUPPORT_UTF8 */
1701
1702       /* Non-UTF-8 mode */
1703         {
1704         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1705         }
1706       break;
1707
1708
1709 #ifdef SUPPORT_UCP
1710       /*-----------------------------------------------------------------*/
1711       /* This is a tricky one because it can match more than one character.
1712       Find out how many characters to skip, and then set up a negative state
1713       to wait for them to pass before continuing. */
1714
1715       case OP_EXTUNI:
1716       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1717         {
1718         const uschar *nptr = ptr + clen;
1719         int ncount = 0;
1720         while (nptr < end_subject)
1721           {
1722           int nclen = 1;
1723           GETCHARLEN(c, nptr, nclen);
1724           if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1725           ncount++;
1726           nptr += nclen;
1727           }
1728         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1729         }
1730       break;
1731 #endif
1732
1733       /*-----------------------------------------------------------------*/
1734       /* This is tricky, like EXTUNI, because it too can match more than one
1735       character (when CR is followed by LF). In this case, set up a negative
1736       state to wait for one character to pass before continuing. */
1737
1738       case OP_ANYNL:
1739       if (clen > 0) switch(c)
1740         {
1741         case 0x000a:
1742         case 0x000b:
1743         case 0x000c:
1744         case 0x0085:
1745         case 0x2028:
1746         case 0x2029:
1747         ADD_NEW(state_offset + 1, 0);
1748         break;
1749         case 0x000d:
1750         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1751           {
1752           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1753           }
1754         else
1755           {
1756           ADD_NEW(state_offset + 1, 0);
1757           }
1758         break;
1759         }
1760       break;
1761
1762       /*-----------------------------------------------------------------*/
1763       case OP_NOT_VSPACE:
1764       if (clen > 0) switch(c)
1765         {
1766         case 0x000a:
1767         case 0x000b:
1768         case 0x000c:
1769         case 0x000d:
1770         case 0x0085:
1771         case 0x2028:
1772         case 0x2029:
1773         break;
1774
1775         default:
1776         ADD_NEW(state_offset + 1, 0);
1777         break;
1778         }
1779       break;
1780
1781       /*-----------------------------------------------------------------*/
1782       case OP_VSPACE:
1783       if (clen > 0) switch(c)
1784         {
1785         case 0x000a:
1786         case 0x000b:
1787         case 0x000c:
1788         case 0x000d:
1789         case 0x0085:
1790         case 0x2028:
1791         case 0x2029:
1792         ADD_NEW(state_offset + 1, 0);
1793         break;
1794
1795         default: break;
1796         }
1797       break;
1798
1799       /*-----------------------------------------------------------------*/
1800       case OP_NOT_HSPACE:
1801       if (clen > 0) switch(c)
1802         {
1803         case 0x09:      /* HT */
1804         case 0x20:      /* SPACE */
1805         case 0xa0:      /* NBSP */
1806         case 0x1680:    /* OGHAM SPACE MARK */
1807         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1808         case 0x2000:    /* EN QUAD */
1809         case 0x2001:    /* EM QUAD */
1810         case 0x2002:    /* EN SPACE */
1811         case 0x2003:    /* EM SPACE */
1812         case 0x2004:    /* THREE-PER-EM SPACE */
1813         case 0x2005:    /* FOUR-PER-EM SPACE */
1814         case 0x2006:    /* SIX-PER-EM SPACE */
1815         case 0x2007:    /* FIGURE SPACE */
1816         case 0x2008:    /* PUNCTUATION SPACE */
1817         case 0x2009:    /* THIN SPACE */
1818         case 0x200A:    /* HAIR SPACE */
1819         case 0x202f:    /* NARROW NO-BREAK SPACE */
1820         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1821         case 0x3000:    /* IDEOGRAPHIC SPACE */
1822         break;
1823
1824         default:
1825         ADD_NEW(state_offset + 1, 0);
1826         break;
1827         }
1828       break;
1829
1830       /*-----------------------------------------------------------------*/
1831       case OP_HSPACE:
1832       if (clen > 0) switch(c)
1833         {
1834         case 0x09:      /* HT */
1835         case 0x20:      /* SPACE */
1836         case 0xa0:      /* NBSP */
1837         case 0x1680:    /* OGHAM SPACE MARK */
1838         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1839         case 0x2000:    /* EN QUAD */
1840         case 0x2001:    /* EM QUAD */
1841         case 0x2002:    /* EN SPACE */
1842         case 0x2003:    /* EM SPACE */
1843         case 0x2004:    /* THREE-PER-EM SPACE */
1844         case 0x2005:    /* FOUR-PER-EM SPACE */
1845         case 0x2006:    /* SIX-PER-EM SPACE */
1846         case 0x2007:    /* FIGURE SPACE */
1847         case 0x2008:    /* PUNCTUATION SPACE */
1848         case 0x2009:    /* THIN SPACE */
1849         case 0x200A:    /* HAIR SPACE */
1850         case 0x202f:    /* NARROW NO-BREAK SPACE */
1851         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1852         case 0x3000:    /* IDEOGRAPHIC SPACE */
1853         ADD_NEW(state_offset + 1, 0);
1854         break;
1855         }
1856       break;
1857
1858       /*-----------------------------------------------------------------*/
1859       /* Match a negated single character. This is only used for one-byte
1860       characters, that is, we know that d < 256. The character we are
1861       checking (c) can be multibyte. */
1862
1863       case OP_NOT:
1864       if (clen > 0)
1865         {
1866         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1867         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1868         }
1869       break;
1870
1871       /*-----------------------------------------------------------------*/
1872       case OP_PLUS:
1873       case OP_MINPLUS:
1874       case OP_POSPLUS:
1875       case OP_NOTPLUS:
1876       case OP_NOTMINPLUS:
1877       case OP_NOTPOSPLUS:
1878       count = current_state->count;  /* Already matched */
1879       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1880       if (clen > 0)
1881         {
1882         unsigned int otherd = NOTACHAR;
1883         if ((ims & PCRE_CASELESS) != 0)
1884           {
1885 #ifdef SUPPORT_UTF8
1886           if (utf8 && d >= 128)
1887             {
1888 #ifdef SUPPORT_UCP
1889             otherd = _pcre_ucp_othercase(d);
1890 #endif  /* SUPPORT_UCP */
1891             }
1892           else
1893 #endif  /* SUPPORT_UTF8 */
1894           otherd = fcc[d];
1895           }
1896         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1897           {
1898           if (count > 0 &&
1899               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1900             {
1901             active_count--;             /* Remove non-match possibility */
1902             next_active_state--;
1903             }
1904           count++;
1905           ADD_NEW(state_offset, count);
1906           }
1907         }
1908       break;
1909
1910       /*-----------------------------------------------------------------*/
1911       case OP_QUERY:
1912       case OP_MINQUERY:
1913       case OP_POSQUERY:
1914       case OP_NOTQUERY:
1915       case OP_NOTMINQUERY:
1916       case OP_NOTPOSQUERY:
1917       ADD_ACTIVE(state_offset + dlen + 1, 0);
1918       if (clen > 0)
1919         {
1920         unsigned int otherd = NOTACHAR;
1921         if ((ims & PCRE_CASELESS) != 0)
1922           {
1923 #ifdef SUPPORT_UTF8
1924           if (utf8 && d >= 128)
1925             {
1926 #ifdef SUPPORT_UCP
1927             otherd = _pcre_ucp_othercase(d);
1928 #endif  /* SUPPORT_UCP */
1929             }
1930           else
1931 #endif  /* SUPPORT_UTF8 */
1932           otherd = fcc[d];
1933           }
1934         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1935           {
1936           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1937             {
1938             active_count--;            /* Remove non-match possibility */
1939             next_active_state--;
1940             }
1941           ADD_NEW(state_offset + dlen + 1, 0);
1942           }
1943         }
1944       break;
1945
1946       /*-----------------------------------------------------------------*/
1947       case OP_STAR:
1948       case OP_MINSTAR:
1949       case OP_POSSTAR:
1950       case OP_NOTSTAR:
1951       case OP_NOTMINSTAR:
1952       case OP_NOTPOSSTAR:
1953       ADD_ACTIVE(state_offset + dlen + 1, 0);
1954       if (clen > 0)
1955         {
1956         unsigned int otherd = NOTACHAR;
1957         if ((ims & PCRE_CASELESS) != 0)
1958           {
1959 #ifdef SUPPORT_UTF8
1960           if (utf8 && d >= 128)
1961             {
1962 #ifdef SUPPORT_UCP
1963             otherd = _pcre_ucp_othercase(d);
1964 #endif  /* SUPPORT_UCP */
1965             }
1966           else
1967 #endif  /* SUPPORT_UTF8 */
1968           otherd = fcc[d];
1969           }
1970         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1971           {
1972           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1973             {
1974             active_count--;            /* Remove non-match possibility */
1975             next_active_state--;
1976             }
1977           ADD_NEW(state_offset, 0);
1978           }
1979         }
1980       break;
1981
1982       /*-----------------------------------------------------------------*/
1983       case OP_EXACT:
1984       case OP_NOTEXACT:
1985       count = current_state->count;  /* Number already matched */
1986       if (clen > 0)
1987         {
1988         unsigned int otherd = NOTACHAR;
1989         if ((ims & PCRE_CASELESS) != 0)
1990           {
1991 #ifdef SUPPORT_UTF8
1992           if (utf8 && d >= 128)
1993             {
1994 #ifdef SUPPORT_UCP
1995             otherd = _pcre_ucp_othercase(d);
1996 #endif  /* SUPPORT_UCP */
1997             }
1998           else
1999 #endif  /* SUPPORT_UTF8 */
2000           otherd = fcc[d];
2001           }
2002         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2003           {
2004           if (++count >= GET2(code, 1))
2005             { ADD_NEW(state_offset + dlen + 3, 0); }
2006           else
2007             { ADD_NEW(state_offset, count); }
2008           }
2009         }
2010       break;
2011
2012       /*-----------------------------------------------------------------*/
2013       case OP_UPTO:
2014       case OP_MINUPTO:
2015       case OP_POSUPTO:
2016       case OP_NOTUPTO:
2017       case OP_NOTMINUPTO:
2018       case OP_NOTPOSUPTO:
2019       ADD_ACTIVE(state_offset + dlen + 3, 0);
2020       count = current_state->count;  /* Number already matched */
2021       if (clen > 0)
2022         {
2023         unsigned int otherd = NOTACHAR;
2024         if ((ims & PCRE_CASELESS) != 0)
2025           {
2026 #ifdef SUPPORT_UTF8
2027           if (utf8 && d >= 128)
2028             {
2029 #ifdef SUPPORT_UCP
2030             otherd = _pcre_ucp_othercase(d);
2031 #endif  /* SUPPORT_UCP */
2032             }
2033           else
2034 #endif  /* SUPPORT_UTF8 */
2035           otherd = fcc[d];
2036           }
2037         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2038           {
2039           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2040             {
2041             active_count--;             /* Remove non-match possibility */
2042             next_active_state--;
2043             }
2044           if (++count >= GET2(code, 1))
2045             { ADD_NEW(state_offset + dlen + 3, 0); }
2046           else
2047             { ADD_NEW(state_offset, count); }
2048           }
2049         }
2050       break;
2051
2052
2053 /* ========================================================================== */
2054       /* These are the class-handling opcodes */
2055
2056       case OP_CLASS:
2057       case OP_NCLASS:
2058       case OP_XCLASS:
2059         {
2060         BOOL isinclass = FALSE;
2061         int next_state_offset;
2062         const uschar *ecode;
2063
2064         /* For a simple class, there is always just a 32-byte table, and we
2065         can set isinclass from it. */
2066
2067         if (codevalue != OP_XCLASS)
2068           {
2069           ecode = code + 33;
2070           if (clen > 0)
2071             {
2072             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2073               ((code[1 + c/8] & (1 << (c&7))) != 0);
2074             }
2075           }
2076
2077         /* An extended class may have a table or a list of single characters,
2078         ranges, or both, and it may be positive or negative. There's a
2079         function that sorts all this out. */
2080
2081         else
2082          {
2083          ecode = code + GET(code, 1);
2084          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2085          }
2086
2087         /* At this point, isinclass is set for all kinds of class, and ecode
2088         points to the byte after the end of the class. If there is a
2089         quantifier, this is where it will be. */
2090
2091         next_state_offset = ecode - start_code;
2092
2093         switch (*ecode)
2094           {
2095           case OP_CRSTAR:
2096           case OP_CRMINSTAR:
2097           ADD_ACTIVE(next_state_offset + 1, 0);
2098           if (isinclass) { ADD_NEW(state_offset, 0); }
2099           break;
2100
2101           case OP_CRPLUS:
2102           case OP_CRMINPLUS:
2103           count = current_state->count;  /* Already matched */
2104           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2105           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2106           break;
2107
2108           case OP_CRQUERY:
2109           case OP_CRMINQUERY:
2110           ADD_ACTIVE(next_state_offset + 1, 0);
2111           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2112           break;
2113
2114           case OP_CRRANGE:
2115           case OP_CRMINRANGE:
2116           count = current_state->count;  /* Already matched */
2117           if (count >= GET2(ecode, 1))
2118             { ADD_ACTIVE(next_state_offset + 5, 0); }
2119           if (isinclass)
2120             {
2121             int max = GET2(ecode, 3);
2122             if (++count >= max && max != 0)   /* Max 0 => no limit */
2123               { ADD_NEW(next_state_offset + 5, 0); }
2124             else
2125               { ADD_NEW(state_offset, count); }
2126             }
2127           break;
2128
2129           default:
2130           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2131           break;
2132           }
2133         }
2134       break;
2135
2136 /* ========================================================================== */
2137       /* These are the opcodes for fancy brackets of various kinds. We have
2138       to use recursion in order to handle them. */
2139
2140       case OP_ASSERT:
2141       case OP_ASSERT_NOT:
2142       case OP_ASSERTBACK:
2143       case OP_ASSERTBACK_NOT:
2144         {
2145         int rc;
2146         int local_offsets[2];
2147         int local_workspace[1000];
2148         const uschar *endasscode = code + GET(code, 1);
2149
2150         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2151
2152         rc = internal_dfa_exec(
2153           md,                                   /* static match data */
2154           code,                                 /* this subexpression's code */
2155           ptr,                                  /* where we currently are */
2156           ptr - start_subject,                  /* start offset */
2157           local_offsets,                        /* offset vector */
2158           sizeof(local_offsets)/sizeof(int),    /* size of same */
2159           local_workspace,                      /* workspace vector */
2160           sizeof(local_workspace)/sizeof(int),  /* size of same */
2161           ims,                                  /* the current ims flags */
2162           rlevel,                               /* function recursion level */
2163           recursing);                           /* pass on regex recursion */
2164
2165         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2166             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2167         }
2168       break;
2169
2170       /*-----------------------------------------------------------------*/
2171       case OP_COND:
2172       case OP_SCOND:
2173         {
2174         int local_offsets[1000];
2175         int local_workspace[1000];
2176         int condcode = code[LINK_SIZE+1];
2177
2178         /* Back reference conditions are not supported */
2179
2180         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2181
2182         /* The DEFINE condition is always false */
2183
2184         if (condcode == OP_DEF)
2185           {
2186           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2187           }
2188
2189         /* The only supported version of OP_RREF is for the value RREF_ANY,
2190         which means "test if in any recursion". We can't test for specifically
2191         recursed groups. */
2192
2193         else if (condcode == OP_RREF)
2194           {
2195           int value = GET2(code, LINK_SIZE+2);
2196           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2197           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2198             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2199           }
2200
2201         /* Otherwise, the condition is an assertion */
2202
2203         else
2204           {
2205           int rc;
2206           const uschar *asscode = code + LINK_SIZE + 1;
2207           const uschar *endasscode = asscode + GET(asscode, 1);
2208
2209           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2210
2211           rc = internal_dfa_exec(
2212             md,                                   /* fixed match data */
2213             asscode,                              /* this subexpression's code */
2214             ptr,                                  /* where we currently are */
2215             ptr - start_subject,                  /* start offset */
2216             local_offsets,                        /* offset vector */
2217             sizeof(local_offsets)/sizeof(int),    /* size of same */
2218             local_workspace,                      /* workspace vector */
2219             sizeof(local_workspace)/sizeof(int),  /* size of same */
2220             ims,                                  /* the current ims flags */
2221             rlevel,                               /* function recursion level */
2222             recursing);                           /* pass on regex recursion */
2223
2224           if ((rc >= 0) ==
2225                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2226             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2227           else
2228             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229           }
2230         }
2231       break;
2232
2233       /*-----------------------------------------------------------------*/
2234       case OP_RECURSE:
2235         {
2236         int local_offsets[1000];
2237         int local_workspace[1000];
2238         int rc;
2239
2240         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2241           recursing + 1));
2242
2243         rc = internal_dfa_exec(
2244           md,                                   /* fixed match data */
2245           start_code + GET(code, 1),            /* this subexpression's code */
2246           ptr,                                  /* where we currently are */
2247           ptr - start_subject,                  /* start offset */
2248           local_offsets,                        /* offset vector */
2249           sizeof(local_offsets)/sizeof(int),    /* size of same */
2250           local_workspace,                      /* workspace vector */
2251           sizeof(local_workspace)/sizeof(int),  /* size of same */
2252           ims,                                  /* the current ims flags */
2253           rlevel,                               /* function recursion level */
2254           recursing + 1);                       /* regex recurse level */
2255
2256         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2257           recursing + 1, rc));
2258
2259         /* Ran out of internal offsets */
2260
2261         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2262
2263         /* For each successful matched substring, set up the next state with a
2264         count of characters to skip before trying it. Note that the count is in
2265         characters, not bytes. */
2266
2267         if (rc > 0)
2268           {
2269           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2270             {
2271             const uschar *p = start_subject + local_offsets[rc];
2272             const uschar *pp = start_subject + local_offsets[rc+1];
2273             int charcount = local_offsets[rc+1] - local_offsets[rc];
2274             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2275             if (charcount > 0)
2276               {
2277               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2278               }
2279             else
2280               {
2281               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2282               }
2283             }
2284           }
2285         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2286         }
2287       break;
2288
2289       /*-----------------------------------------------------------------*/
2290       case OP_ONCE:
2291         {
2292         int local_offsets[2];
2293         int local_workspace[1000];
2294
2295         int rc = internal_dfa_exec(
2296           md,                                   /* fixed match data */
2297           code,                                 /* this subexpression's code */
2298           ptr,                                  /* where we currently are */
2299           ptr - start_subject,                  /* start offset */
2300           local_offsets,                        /* offset vector */
2301           sizeof(local_offsets)/sizeof(int),    /* size of same */
2302           local_workspace,                      /* workspace vector */
2303           sizeof(local_workspace)/sizeof(int),  /* size of same */
2304           ims,                                  /* the current ims flags */
2305           rlevel,                               /* function recursion level */
2306           recursing);                           /* pass on regex recursion */
2307
2308         if (rc >= 0)
2309           {
2310           const uschar *end_subpattern = code;
2311           int charcount = local_offsets[1] - local_offsets[0];
2312           int next_state_offset, repeat_state_offset;
2313
2314           do { end_subpattern += GET(end_subpattern, 1); }
2315             while (*end_subpattern == OP_ALT);
2316           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2317
2318           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2319           arrange for the repeat state also to be added to the relevant list.
2320           Calculate the offset, or set -1 for no repeat. */
2321
2322           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2323                                  *end_subpattern == OP_KETRMIN)?
2324             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2325
2326           /* If we have matched an empty string, add the next state at the
2327           current character pointer. This is important so that the duplicate
2328           checking kicks in, which is what breaks infinite loops that match an
2329           empty string. */
2330
2331           if (charcount == 0)
2332             {
2333             ADD_ACTIVE(next_state_offset, 0);
2334             }
2335
2336           /* Optimization: if there are no more active states, and there
2337           are no new states yet set up, then skip over the subject string
2338           right here, to save looping. Otherwise, set up the new state to swing
2339           into action when the end of the substring is reached. */
2340
2341           else if (i + 1 >= active_count && new_count == 0)
2342             {
2343             ptr += charcount;
2344             clen = 0;
2345             ADD_NEW(next_state_offset, 0);
2346
2347             /* If we are adding a repeat state at the new character position,
2348             we must fudge things so that it is the only current state.
2349             Otherwise, it might be a duplicate of one we processed before, and
2350             that would cause it to be skipped. */
2351
2352             if (repeat_state_offset >= 0)
2353               {
2354               next_active_state = active_states;
2355               active_count = 0;
2356               i = -1;
2357               ADD_ACTIVE(repeat_state_offset, 0);
2358               }
2359             }
2360           else
2361             {
2362             const uschar *p = start_subject + local_offsets[0];
2363             const uschar *pp = start_subject + local_offsets[1];
2364             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2365             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2366             if (repeat_state_offset >= 0)
2367               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2368             }
2369
2370           }
2371         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2372         }
2373       break;
2374
2375
2376 /* ========================================================================== */
2377       /* Handle callouts */
2378
2379       case OP_CALLOUT:
2380       if (pcre_callout != NULL)
2381         {
2382         int rrc;
2383         pcre_callout_block cb;
2384         cb.version          = 1;   /* Version 1 of the callout block */
2385         cb.callout_number   = code[1];
2386         cb.offset_vector    = offsets;
2387         cb.subject          = (PCRE_SPTR)start_subject;
2388         cb.subject_length   = end_subject - start_subject;
2389         cb.start_match      = current_subject - start_subject;
2390         cb.current_position = ptr - start_subject;
2391         cb.pattern_position = GET(code, 2);
2392         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2393         cb.capture_top      = 1;
2394         cb.capture_last     = -1;
2395         cb.callout_data     = md->callout_data;
2396         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2397         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2398         }
2399       break;
2400
2401
2402 /* ========================================================================== */
2403       default:        /* Unsupported opcode */
2404       return PCRE_ERROR_DFA_UITEM;
2405       }
2406
2407     NEXT_ACTIVE_STATE: continue;
2408
2409     }      /* End of loop scanning active states */
2410
2411   /* We have finished the processing at the current subject character. If no
2412   new states have been set for the next character, we have found all the
2413   matches that we are going to find. If we are at the top level and partial
2414   matching has been requested, check for appropriate conditions. */
2415
2416   if (new_count <= 0)
2417     {
2418     if (match_count < 0 &&                     /* No matches found */
2419         rlevel == 1 &&                         /* Top level match function */
2420         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2421         ptr >= end_subject &&                  /* Reached end of subject */
2422         ptr > current_subject)                 /* Matched non-empty string */
2423       {
2424       if (offsetcount >= 2)
2425         {
2426         offsets[0] = current_subject - start_subject;
2427         offsets[1] = end_subject - start_subject;
2428         }
2429       match_count = PCRE_ERROR_PARTIAL;
2430       }
2431
2432     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2433       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2434       rlevel*2-2, SP));
2435     break;        /* In effect, "return", but see the comment below */
2436     }
2437
2438   /* One or more states are active for the next character. */
2439
2440   ptr += clen;    /* Advance to next subject character */
2441   }               /* Loop to move along the subject string */
2442
2443 /* Control gets here from "break" a few lines above. We do it this way because
2444 if we use "return" above, we have compiler trouble. Some compilers warn if
2445 there's nothing here because they think the function doesn't return a value. On
2446 the other hand, if we put a dummy statement here, some more clever compilers
2447 complain that it can't be reached. Sigh. */
2448
2449 return match_count;
2450 }
2451
2452
2453
2454
2455 /*************************************************
2456 *    Execute a Regular Expression - DFA engine   *
2457 *************************************************/
2458
2459 /* This external function applies a compiled re to a subject string using a DFA
2460 engine. This function calls the internal function multiple times if the pattern
2461 is not anchored.
2462
2463 Arguments:
2464   argument_re     points to the compiled expression
2465   extra_data      points to extra data or is NULL
2466   subject         points to the subject string
2467   length          length of subject string (may contain binary zeros)
2468   start_offset    where to start in the subject string
2469   options         option bits
2470   offsets         vector of match offsets
2471   offsetcount     size of same
2472   workspace       workspace vector
2473   wscount         size of same
2474
2475 Returns:          > 0 => number of match offset pairs placed in offsets
2476                   = 0 => offsets overflowed; longest matches are present
2477                    -1 => failed to match
2478                  < -1 => some kind of unexpected problem
2479 */
2480
2481 PCRE_EXP_DEFN int
2482 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2483   const char *subject, int length, int start_offset, int options, int *offsets,
2484   int offsetcount, int *workspace, int wscount)
2485 {
2486 real_pcre *re = (real_pcre *)argument_re;
2487 dfa_match_data match_block;
2488 dfa_match_data *md = &match_block;
2489 BOOL utf8, anchored, startline, firstline;
2490 const uschar *current_subject, *end_subject, *lcc;
2491
2492 pcre_study_data internal_study;
2493 const pcre_study_data *study = NULL;
2494 real_pcre internal_re;
2495
2496 const uschar *req_byte_ptr;
2497 const uschar *start_bits = NULL;
2498 BOOL first_byte_caseless = FALSE;
2499 BOOL req_byte_caseless = FALSE;
2500 int first_byte = -1;
2501 int req_byte = -1;
2502 int req_byte2 = -1;
2503 int newline;
2504
2505 /* Plausibility checks */
2506
2507 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2508 if (re == NULL || subject == NULL || workspace == NULL ||
2509    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2510 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2511 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2512
2513 /* We need to find the pointer to any study data before we test for byte
2514 flipping, so we scan the extra_data block first. This may set two fields in the
2515 match block, so we must initialize them beforehand. However, the other fields
2516 in the match block must not be set until after the byte flipping. */
2517
2518 md->tables = re->tables;
2519 md->callout_data = NULL;
2520
2521 if (extra_data != NULL)
2522   {
2523   unsigned int flags = extra_data->flags;
2524   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2525     study = (const pcre_study_data *)extra_data->study_data;
2526   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2527   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2528     return PCRE_ERROR_DFA_UMLIMIT;
2529   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2530     md->callout_data = extra_data->callout_data;
2531   if ((flags & PCRE_EXTRA_TABLES) != 0)
2532     md->tables = extra_data->tables;
2533   }
2534
2535 /* Check that the first field in the block is the magic number. If it is not,
2536 test for a regex that was compiled on a host of opposite endianness. If this is
2537 the case, flipped values are put in internal_re and internal_study if there was
2538 study data too. */
2539
2540 if (re->magic_number != MAGIC_NUMBER)
2541   {
2542   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2543   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2544   if (study != NULL) study = &internal_study;
2545   }
2546
2547 /* Set some local values */
2548
2549 current_subject = (const unsigned char *)subject + start_offset;
2550 end_subject = (const unsigned char *)subject + length;
2551 req_byte_ptr = current_subject - 1;
2552
2553 #ifdef SUPPORT_UTF8
2554 utf8 = (re->options & PCRE_UTF8) != 0;
2555 #else
2556 utf8 = FALSE;
2557 #endif
2558
2559 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2560   (re->options & PCRE_ANCHORED) != 0;
2561
2562 /* The remaining fixed data for passing around. */
2563
2564 md->start_code = (const uschar *)argument_re +
2565     re->name_table_offset + re->name_count * re->name_entry_size;
2566 md->start_subject = (const unsigned char *)subject;
2567 md->end_subject = end_subject;
2568 md->moptions = options;
2569 md->poptions = re->options;
2570
2571 /* Handle different types of newline. The three bits give eight cases. If
2572 nothing is set at run time, whatever was used at compile time applies. */
2573
2574 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2575          PCRE_NEWLINE_BITS)
2576   {
2577   case 0: newline = NEWLINE; break;   /* Compile-time default */
2578   case PCRE_NEWLINE_CR: newline = '\r'; break;
2579   case PCRE_NEWLINE_LF: newline = '\n'; break;
2580   case PCRE_NEWLINE_CR+
2581        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2582   case PCRE_NEWLINE_ANY: newline = -1; break;
2583   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2584   default: return PCRE_ERROR_BADNEWLINE;
2585   }
2586
2587 if (newline == -2)
2588   {
2589   md->nltype = NLTYPE_ANYCRLF;
2590   }
2591 else if (newline < 0)
2592   {
2593   md->nltype = NLTYPE_ANY;
2594   }
2595 else
2596   {
2597   md->nltype = NLTYPE_FIXED;
2598   if (newline > 255)
2599     {
2600     md->nllen = 2;
2601     md->nl[0] = (newline >> 8) & 255;
2602     md->nl[1] = newline & 255;
2603     }
2604   else
2605     {
2606     md->nllen = 1;
2607     md->nl[0] = newline;
2608     }
2609   }
2610
2611 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2612 back the character offset. */
2613
2614 #ifdef SUPPORT_UTF8
2615 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2616   {
2617   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2618     return PCRE_ERROR_BADUTF8;
2619   if (start_offset > 0 && start_offset < length)
2620     {
2621     int tb = ((uschar *)subject)[start_offset];
2622     if (tb > 127)
2623       {
2624       tb &= 0xc0;
2625       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2626       }
2627     }
2628   }
2629 #endif
2630
2631 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2632 is a feature that makes it possible to save compiled regex and re-use them
2633 in other programs later. */
2634
2635 if (md->tables == NULL) md->tables = _pcre_default_tables;
2636
2637 /* The lower casing table and the "must be at the start of a line" flag are
2638 used in a loop when finding where to start. */
2639
2640 lcc = md->tables + lcc_offset;
2641 startline = (re->options & PCRE_STARTLINE) != 0;
2642 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2643
2644 /* Set up the first character to match, if available. The first_byte value is
2645 never set for an anchored regular expression, but the anchoring may be forced
2646 at run time, so we have to test for anchoring. The first char may be unset for
2647 an unanchored pattern, of course. If there's no first char and the pattern was
2648 studied, there may be a bitmap of possible first characters. */
2649
2650 if (!anchored)
2651   {
2652   if ((re->options & PCRE_FIRSTSET) != 0)
2653     {
2654     first_byte = re->first_byte & 255;
2655     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2656       first_byte = lcc[first_byte];
2657     }
2658   else
2659     {
2660     if (startline && study != NULL &&
2661          (study->options & PCRE_STUDY_MAPPED) != 0)
2662       start_bits = study->start_bits;
2663     }
2664   }
2665
2666 /* For anchored or unanchored matches, there may be a "last known required
2667 character" set. */
2668
2669 if ((re->options & PCRE_REQCHSET) != 0)
2670   {
2671   req_byte = re->req_byte & 255;
2672   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2673   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2674   }
2675
2676 /* Call the main matching function, looping for a non-anchored regex after a
2677 failed match. Unless restarting, optimize by moving to the first match
2678 character if possible, when not anchored. Then unless wanting a partial match,
2679 check for a required later character. */
2680
2681 for (;;)
2682   {
2683   int rc;
2684
2685   if ((options & PCRE_DFA_RESTART) == 0)
2686     {
2687     const uschar *save_end_subject = end_subject;
2688
2689     /* Advance to a unique first char if possible. If firstline is TRUE, the
2690     start of the match is constrained to the first line of a multiline string.
2691     Implement this by temporarily adjusting end_subject so that we stop
2692     scanning at a newline. If the match fails at the newline, later code breaks
2693     this loop. */
2694
2695     if (firstline)
2696       {
2697       const uschar *t = current_subject;
2698       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2699       end_subject = t;
2700       }
2701
2702     if (first_byte >= 0)
2703       {
2704       if (first_byte_caseless)
2705         while (current_subject < end_subject &&
2706                lcc[*current_subject] != first_byte)
2707           current_subject++;
2708       else
2709         while (current_subject < end_subject && *current_subject != first_byte)
2710           current_subject++;
2711       }
2712
2713     /* Or to just after a linebreak for a multiline match if possible */
2714
2715     else if (startline)
2716       {
2717       if (current_subject > md->start_subject + start_offset)
2718         {
2719         while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2720           current_subject++;
2721
2722         /* If we have just passed a CR and the newline option is ANY or
2723         ANYCRLF, and we are now at a LF, advance the match position by one more
2724         character. */
2725
2726         if (current_subject[-1] == '\r' &&
2727              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2728              current_subject < end_subject &&
2729              *current_subject == '\n')
2730           current_subject++;
2731         }
2732       }
2733
2734     /* Or to a non-unique first char after study */
2735
2736     else if (start_bits != NULL)
2737       {
2738       while (current_subject < end_subject)
2739         {
2740         register unsigned int c = *current_subject;
2741         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2742           else break;
2743         }
2744       }
2745
2746     /* Restore fudged end_subject */
2747
2748     end_subject = save_end_subject;
2749     }
2750
2751   /* If req_byte is set, we know that that character must appear in the subject
2752   for the match to succeed. If the first character is set, req_byte must be
2753   later in the subject; otherwise the test starts at the match point. This
2754   optimization can save a huge amount of work in patterns with nested unlimited
2755   repeats that aren't going to match. Writing separate code for cased/caseless
2756   versions makes it go faster, as does using an autoincrement and backing off
2757   on a match.
2758
2759   HOWEVER: when the subject string is very, very long, searching to its end can
2760   take a long time, and give bad performance on quite ordinary patterns. This
2761   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2762   don't do this when the string is sufficiently long.
2763
2764   ALSO: this processing is disabled when partial matching is requested.
2765   */
2766
2767   if (req_byte >= 0 &&
2768       end_subject - current_subject < REQ_BYTE_MAX &&
2769       (options & PCRE_PARTIAL) == 0)
2770     {
2771     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2772
2773     /* We don't need to repeat the search if we haven't yet reached the
2774     place we found it at last time. */
2775
2776     if (p > req_byte_ptr)
2777       {
2778       if (req_byte_caseless)
2779         {
2780         while (p < end_subject)
2781           {
2782           register int pp = *p++;
2783           if (pp == req_byte || pp == req_byte2) { p--; break; }
2784           }
2785         }
2786       else
2787         {
2788         while (p < end_subject)
2789           {
2790           if (*p++ == req_byte) { p--; break; }
2791           }
2792         }
2793
2794       /* If we can't find the required character, break the matching loop,
2795       which will cause a return or PCRE_ERROR_NOMATCH. */
2796
2797       if (p >= end_subject) break;
2798
2799       /* If we have found the required character, save the point where we
2800       found it, so that we don't search again next time round the loop if
2801       the start hasn't passed this character yet. */
2802
2803       req_byte_ptr = p;
2804       }
2805     }
2806
2807   /* OK, now we can do the business */
2808
2809   rc = internal_dfa_exec(
2810     md,                                /* fixed match data */
2811     md->start_code,                    /* this subexpression's code */
2812     current_subject,                   /* where we currently are */
2813     start_offset,                      /* start offset in subject */
2814     offsets,                           /* offset vector */
2815     offsetcount,                       /* size of same */
2816     workspace,                         /* workspace vector */
2817     wscount,                           /* size of same */
2818     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2819     0,                                 /* function recurse level */
2820     0);                                /* regex recurse level */
2821
2822   /* Anything other than "no match" means we are done, always; otherwise, carry
2823   on only if not anchored. */
2824
2825   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2826
2827   /* Advance to the next subject character unless we are at the end of a line
2828   and firstline is set. */
2829
2830   if (firstline && IS_NEWLINE(current_subject)) break;
2831   current_subject++;
2832   if (utf8)
2833     {
2834     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2835       current_subject++;
2836     }
2837   if (current_subject > end_subject) break;
2838
2839   /* If we have just passed a CR and the newline option is CRLF or ANY or
2840   ANYCRLF, and we are now at a LF, advance the match position by one more
2841   character. */
2842
2843   if (current_subject[-1] == '\r' &&
2844        (md->nltype == NLTYPE_ANY ||
2845         md->nltype == NLTYPE_ANYCRLF ||
2846         md->nllen == 2) &&
2847        current_subject < end_subject &&
2848        *current_subject == '\n')
2849     current_subject++;
2850
2851   }   /* "Bumpalong" loop */
2852
2853 return PCRE_ERROR_NOMATCH;
2854 }
2855
2856 /* End of pcre_dfa_exec.c */