src/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2008 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 #ifdef HAVE_CONFIG_H
  48 #include "config.h"
  49 #endif
  50
  51 #define NLBLOCK md             /* Block containing newline information */
  52 #define PSSTART start_subject  /* Field containing processed string start */
  53 #define PSEND   end_subject    /* Field containing processed string end */
  54
  55 #include "pcre_internal.h"
  56
  57
  58 /* For use to indent debugging output */
  59
  60 #define SP "                   "
  61
  62
  63
  64 /*************************************************
  65 *      Code parameters and static tables         *
  66 *************************************************/
  67
  68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  69 into others, under special conditions. A gap of 20 between the blocks should be
  70 enough. The resulting opcodes don't have to be less than 256 because they are
  71 never stored, so we push them well clear of the normal opcodes. */
  72
  73 #define OP_PROP_EXTRA       300   /* added to the opcode when its type argument is OP_PROP or OP_NOTPROP */
  74 #define OP_EXTUNI_EXTRA     320   /* added to the opcode when its type argument is OP_EXTUNI */
  75 #define OP_ANYNL_EXTRA      340   /* added to the opcode when its type argument is OP_ANYNL */
  76 #define OP_HSPACE_EXTRA     360   /* added to the opcode when its type argument is OP_HSPACE or OP_NOT_HSPACE */
  77 #define OP_VSPACE_EXTRA     380   /* added to the opcode when its type argument is OP_VSPACE or OP_NOT_VSPACE */
  78
  79
  80 /* This table identifies those opcodes that are followed immediately by a
  81 character that is to be tested in some way. This makes it possible to
  82 centralize the loading of these characters. In the case of Type * etc, the
  83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  84 small value. ***NOTE*** If the start of this table is modified, the two tables
  85 that follow must also be modified. */
  86
  87 static const uschar coptable[] = {   /* indexed by opcode: 0 = no inline character, else offset from the opcode to it */
  88   0,                             /* End                                    */
  89   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  90   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  91   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  92   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
  93   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  94   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  95   1,                             /* Char                                   */
  96   1,                             /* Charnc                                 */
  97   1,                             /* not                                    */
  98   /* Positive single-char repeats                                          */
  99   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 100   3, 3, 3,                       /* upto, minupto, exact                   */
 101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
 102   /* Negative single-char repeats - only for chars < 256                   */
 103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 104   3, 3, 3,                       /* NOT upto, minupto, exact               */
 105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
 106   /* Positive type repeats                                                 */
 107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 108   3, 3, 3,                       /* Type upto, minupto, exact              */
 109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 110   /* Character class & ref repeats                                         */
 111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 113   0,                             /* CLASS                                  */
 114   0,                             /* NCLASS                                 */
 115   0,                             /* XCLASS - variable length               */
 116   0,                             /* REF                                    */
 117   0,                             /* RECURSE                                */
 118   0,                             /* CALLOUT                                */
 119   0,                             /* Alt                                    */
 120   0,                             /* Ket                                    */
 121   0,                             /* KetRmax                                */
 122   0,                             /* KetRmin                                */
 123   0,                             /* Assert                                 */
 124   0,                             /* Assert not                             */
 125   0,                             /* Assert behind                          */
 126   0,                             /* Assert behind not                      */
 127   0,                             /* Reverse                                */
 128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 130   0,                             /* CREF                                   */
 131   0,                             /* RREF                                   */
 132   0,                             /* DEF                                    */
 133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
 134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
 135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
 136 };
 137
 138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 139 and \w */
 140
 141 static const uschar toptable1[] = {   /* indexed by opcode: mask that is ANDed with ctypes[c] */
 142   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b: unused slots */
 143   ctype_digit, ctype_digit,       /* \D, \d */
 144   ctype_space, ctype_space,       /* \S, \s */
 145   ctype_word,  ctype_word,        /* \W, \w */
 146   0, 0                            /* OP_ANY, OP_ALLANY */
 147 };
 148
 149 static const uschar toptable2[] = {   /* indexed by opcode: XORed with the masked bits; non-zero result = match */
 150   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b: unused slots */
 151   ctype_digit, 0,                 /* \D (matches when the digit bit is clear), \d */
 152   ctype_space, 0,                 /* \S (matches when the space bit is clear), \s */
 153   ctype_word,  0,                 /* \W (matches when the word bit is clear), \w */
 154   1, 1                            /* OP_ANY, OP_ALLANY: mask is 0, so the result is always non-zero */
 155 };
 156
 157
 158 /* Structure for holding data about a particular state, which is in effect the
 159 current data for an active path through the match tree. It must consist
 160 entirely of ints because the working vector we are passed, and which we put
 161 these structures in, is a vector of ints. */
 162
 163 typedef struct stateblock {
 164   int offset;                     /* Offset to opcode from start_code; negative = state held off (see data) */
 165   int count;                      /* Count for repeats */
 166   int ims;                        /* ims flag bits in force when the state was added */
 167   int data;                       /* Extra data for some states, e.g. characters still to skip while offset < 0 */
 168 } stateblock;
 169
 170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))   /* ints of workspace taken by one state */
 171
 172
 173 #ifdef DEBUG
 174 /*************************************************
 175 *             Print character string             *
 176 *************************************************/
 177
 178 /* Character string printing function for debugging.
 179
 180 Arguments:
 181   p            points to string
 182   length       number of bytes
 183   f            where to print
 184
 185 Returns:       nothing
 186 */
 187
 188 static void
 189 pchars(unsigned char *p, int length, FILE *f)
 190 {
 191 int c;                             /* current byte, held as int for isprint() and fprintf() */
 192 while (length-- > 0)               /* exactly 'length' bytes are printed; no terminator is looked for */
 193   {
 194   if (isprint(c = *(p++)))
 195     fprintf(f, "%c", c);           /* printable byte: written as is */
 196   else
 197     fprintf(f, "\\x%02x", c);      /* any other byte: backslash, x, then two hex digits */
 198   }
 199 }
 200 #endif
 201
 202
 203
 204 /*************************************************
 205 *    Execute a Regular Expression - DFA engine   *
 206 *************************************************/
 207
 208 /* This internal function applies a compiled pattern to a subject string,
 209 starting at a given point, using a DFA engine. This function is called from the
 210 external one, possibly multiple times if the pattern is not anchored. The
 211 function calls itself recursively for some kinds of subpattern.
 212
 213 Arguments:
 214   md                the match_data block with fixed information
 215   this_start_code   the opening bracket of this subexpression's code
 216   current_subject   where we currently are in the subject string
 217   start_offset      start offset in the subject string
 218   offsets           vector to contain the matching string offsets
 219   offsetcount       size of same
 220   workspace         vector of workspace
 221   wscount           size of same
 222   ims               the current ims flags
 223   rlevel            function call recursion level
 224   recursing         regex recursive call level
 225
 226 Returns:            > 0 => number of match offset pairs placed in offsets
 227                     = 0 => offsets overflowed; longest matches are present
 228                      -1 => failed to match
 229                    < -1 => some kind of unexpected problem
 230
 231 The following macros are used for adding states to the two state vectors (one
 232 for the current character, one for the following character). */
 233
 234 #define ADD_ACTIVE(x,y) /* append a state (offset x, count y) to the active list */ \
 235   if (active_count++ < wscount) /* the counter is bumped even when the list is full */ \
 236     { \
 237     next_active_state->offset = (x); \
 238     next_active_state->count  = (y); \
 239     next_active_state->ims    = ims; /* ims is taken from the enclosing scope */ \
 240     next_active_state++; /* the data field is not written by this macro */ \
 241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 242     } \
 243   else return PCRE_ERROR_DFA_WSSIZE   /* no room left: leave the enclosing function; the caller's ';' ends this statement */
 244
 245 #define ADD_ACTIVE_DATA(x,y,z) /* append a state (offset x, count y, data z) to the active list */ \
 246   if (active_count++ < wscount) /* the counter is bumped even when the list is full */ \
 247     { \
 248     next_active_state->offset = (x); \
 249     next_active_state->count  = (y); \
 250     next_active_state->ims    = ims; /* ims is taken from the enclosing scope */ \
 251     next_active_state->data   = (z); \
 252     next_active_state++; \
 253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 254     } \
 255   else return PCRE_ERROR_DFA_WSSIZE   /* no room left: leave the enclosing function; the caller's ';' ends this statement */
 256
 257 #define ADD_NEW(x,y) /* append a state (offset x, count y) to the list for the following character */ \
 258   if (new_count++ < wscount) /* the counter is bumped even when the list is full */ \
 259     { \
 260     next_new_state->offset = (x); \
 261     next_new_state->count  = (y); \
 262     next_new_state->ims    = ims; /* ims is taken from the enclosing scope */ \
 263     next_new_state++; /* the data field is not written by this macro */ \
 264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 265     } \
 266   else return PCRE_ERROR_DFA_WSSIZE   /* no room left: leave the enclosing function; the caller's ';' ends this statement */
 267
 268 #define ADD_NEW_DATA(x,y,z) /* append a state (offset x, count y, data z) to the list for the following character */ \
 269   if (new_count++ < wscount) /* the counter is bumped even when the list is full */ \
 270     { \
 271     next_new_state->offset = (x); \
 272     next_new_state->count  = (y); \
 273     next_new_state->ims    = ims; /* ims is taken from the enclosing scope */ \
 274     next_new_state->data   = (z); \
 275     next_new_state++; \
 276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 277     } \
 278   else return PCRE_ERROR_DFA_WSSIZE   /* no room left: leave the enclosing function; the caller's ';' ends this statement */
 279
 280 /* And now, here is the code */
 281
 282 static int
 283 internal_dfa_exec(
 284   dfa_match_data *md,
 285   const uschar *this_start_code,
 286   const uschar *current_subject,
 287   int start_offset,
 288   int *offsets,
 289   int offsetcount,
 290   int *workspace,
 291   int wscount,
 292   int ims,
 293   int  rlevel,
 294   int  recursing)
 295 {
 296 stateblock *active_states, *new_states, *temp_states;
 297 stateblock *next_active_state, *next_new_state;
 298
 299 const uschar *ctypes, *lcc, *fcc;
 300 const uschar *ptr;
 301 const uschar *end_code, *first_op;
 302
 303 int active_count, new_count, match_count;
 304
 305 /* Some fields in the md block are frequently referenced, so we load them into
 306 independent variables in the hope that this will perform better. */
 307
 308 const uschar *start_subject = md->start_subject;
 309 const uschar *end_subject = md->end_subject;
 310 const uschar *start_code = md->start_code;
 311
 312 #ifdef SUPPORT_UTF8
 313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 314 #else
 315 BOOL utf8 = FALSE;
 316 #endif
 317
 318 rlevel++;
 319 offsetcount &= (-2);
 320
 321 wscount -= 2;
 322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 323           (2 * INTS_PER_STATEBLOCK);
 324
 325 DPRINTF(("\n%.*s---------------------\n"
 326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 328
 329 ctypes = md->tables + ctypes_offset;
 330 lcc = md->tables + lcc_offset;
 331 fcc = md->tables + fcc_offset;
 332
 333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 334
 335 active_states = (stateblock *)(workspace + 2);
 336 next_new_state = new_states = active_states + wscount;
 337 new_count = 0;
 338
 339 first_op = this_start_code + 1 + LINK_SIZE +
 340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 341
 342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 343 the alternative states onto the list, and find out where the end is. This
 344 makes it possible to use this function recursively, when we want to stop at a
 345 matching internal ket rather than at the end.
 346
 347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 348 a backward assertion. In that case, we have to find out the maximum amount to
 349 move back, and set up each alternative appropriately. */
 350
 351 if (*first_op == OP_REVERSE)
 352   {
 353   int max_back = 0;
 354   int gone_back;
 355
 356   end_code = this_start_code;
 357   do
 358     {
 359     int back = GET(end_code, 2+LINK_SIZE);
 360     if (back > max_back) max_back = back;
 361     end_code += GET(end_code, 1);
 362     }
 363   while (*end_code == OP_ALT);
 364
 365   /* If we can't go back the amount required for the longest lookbehind
 366   pattern, go back as far as we can; some alternatives may still be viable. */
 367
 368 #ifdef SUPPORT_UTF8
 369   /* In character mode we have to step back character by character */
 370
 371   if (utf8)
 372     {
 373     for (gone_back = 0; gone_back < max_back; gone_back++)
 374       {
 375       if (current_subject <= start_subject) break;
 376       current_subject--;
 377       while (current_subject > start_subject &&
 378              (*current_subject & 0xc0) == 0x80)
 379         current_subject--;
 380       }
 381     }
 382   else
 383 #endif
 384
 385   /* In byte-mode we can do this quickly. */
 386
 387     {
 388     gone_back = (current_subject - max_back < start_subject)?
 389       current_subject - start_subject : max_back;
 390     current_subject -= gone_back;
 391     }
 392
 393   /* Now we can process the individual branches. */
 394
 395   end_code = this_start_code;
 396   do
 397     {
 398     int back = GET(end_code, 2+LINK_SIZE);
 399     if (back <= gone_back)
 400       {
 401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 403       }
 404     end_code += GET(end_code, 1);
 405     }
 406   while (*end_code == OP_ALT);
 407  }
 408
 409 /* This is the code for a "normal" subpattern (not a backward assertion). The
 410 start of a whole pattern is always one of these. If we are at the top level,
 411 we may be asked to restart matching from the same point that we reached for a
 412 previous partial match. We still have to scan through the top-level branches to
 413 find the end state. */
 414
 415 else
 416   {
 417   end_code = this_start_code;
 418
 419   /* Restarting */
 420
 421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 422     {
 423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 424     new_count = workspace[1];
 425     if (!workspace[0])
 426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 427     }
 428
 429   /* Not restarting */
 430
 431   else
 432     {
 433     int length = 1 + LINK_SIZE +
 434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 435     do
 436       {
 437       ADD_NEW(end_code - start_code + length, 0);
 438       end_code += GET(end_code, 1);
 439       length = 1 + LINK_SIZE;
 440       }
 441     while (*end_code == OP_ALT);
 442     }
 443   }
 444
 445 workspace[0] = 0;    /* Bit indicating which vector is current */
 446
 447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 448
 449 /* Loop for scanning the subject */
 450
 451 ptr = current_subject;
 452 for (;;)
 453   {
 454   int i, j;
 455   int clen, dlen;
 456   unsigned int c, d;
 457
 458   /* Make the new state list into the active state list and empty the
 459   new state list. */
 460
 461   temp_states = active_states;
 462   active_states = new_states;
 463   new_states = temp_states;
 464   active_count = new_count;
 465   new_count = 0;
 466
 467   workspace[0] ^= 1;              /* Remember for the restarting feature */
 468   workspace[1] = active_count;
 469
 470 #ifdef DEBUG
 471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 473   printf("\"\n");
 474
 475   printf("%.*sActive states: ", rlevel*2-2, SP);
 476   for (i = 0; i < active_count; i++)
 477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 478   printf("\n");
 479 #endif
 480
 481   /* Set the pointers for adding new states */
 482
 483   next_active_state = active_states + active_count;
 484   next_new_state = new_states;
 485
 486   /* Load the current character from the subject outside the loop, as many
 487   different states may want to look at it, and we assume that at least one
 488   will. */
 489
 490   if (ptr < end_subject)
 491     {
 492     clen = 1;        /* Number of bytes in the character */
 493 #ifdef SUPPORT_UTF8
 494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 495 #endif  /* SUPPORT_UTF8 */
 496     c = *ptr;
 497     }
 498   else
 499     {
 500     clen = 0;        /* This indicates the end of the subject */
 501     c = NOTACHAR;    /* This value should never actually be used */
 502     }
 503
 504   /* Scan up the active states and act on each one. The result of an action
 505   may be to add more states to the currently active list (e.g. on hitting a
 506   parenthesis) or it may be to put states on the new list, for considering
 507   when we move the character pointer on. */
 508
 509   for (i = 0; i < active_count; i++)
 510     {
 511     stateblock *current_state = active_states + i;
 512     const uschar *code;
 513     int state_offset = current_state->offset;
 514     int count, codevalue;
 515 #ifdef SUPPORT_UCP
 516     int chartype, script;
 517 #endif
 518
 519 #ifdef DEBUG
 520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 521     if (clen == 0) printf("EOL\n");
 522       else if (c > 32 && c < 127) printf("'%c'\n", c);
 523         else printf("0x%02x\n", c);
 524 #endif
 525
 526     /* This variable is referred to implicitly in the ADD_xxx macros. */
 527
 528     ims = current_state->ims;
 529
 530     /* A negative offset is a special case meaning "hold off going to this
 531     (negated) state until the number of characters in the data field have
 532     been skipped". */
 533
 534     if (state_offset < 0)
 535       {
 536       if (current_state->data > 0)
 537         {
 538         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 539         ADD_NEW_DATA(state_offset, current_state->count,
 540           current_state->data - 1);
 541         continue;
 542         }
 543       else
 544         {
 545         current_state->offset = state_offset = -state_offset;
 546         }
 547       }
 548
 549     /* Check for a duplicate state with the same count, and skip if found. */
 550
 551     for (j = 0; j < i; j++)
 552       {
 553       if (active_states[j].offset == state_offset &&
 554           active_states[j].count == current_state->count)
 555         {
 556         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 557         goto NEXT_ACTIVE_STATE;
 558         }
 559       }
 560
 561     /* The state offset is the offset to the opcode */
 562
 563     code = start_code + state_offset;
 564     codevalue = *code;
 565
 566     /* If this opcode is followed by an inline character, load it. It is
 567     tempting to test for the presence of a subject character here, but that
 568     is wrong, because sometimes zero repetitions of the subject are
 569     permitted.
 570
 571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 572     argument that is not a data character - but is always one byte long. We
 573     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
 574     this case. To keep the other cases fast, convert these ones to new opcodes.
 575     */
 576
 577     if (coptable[codevalue] > 0)
 578       {
 579       dlen = 1;
 580 #ifdef SUPPORT_UTF8
 581       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 582 #endif  /* SUPPORT_UTF8 */
 583       d = code[coptable[codevalue]];
 584       if (codevalue >= OP_TYPESTAR)
 585         {
 586         switch(d)
 587           {
 588           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 589           case OP_NOTPROP:
 590           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 591           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 592           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 593           case OP_NOT_HSPACE:
 594           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 595           case OP_NOT_VSPACE:
 596           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 597           default: break;
 598           }
 599         }
 600       }
 601     else
 602       {
 603       dlen = 0;         /* Not strictly necessary, but compilers moan */
 604       d = NOTACHAR;     /* if these variables are not set. */
 605       }
 606
 607
 608     /* Now process the individual opcodes */
 609
 610     switch (codevalue)
 611       {
 612
 613 /* ========================================================================== */
 614       /* Reached a closing bracket. If not at the end of the pattern, carry
 615       on with the next opcode. Otherwise, unless we have an empty string and
 616       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
 617       matches so we always have the longest first. */
 618
 619       case OP_KET:
 620       case OP_KETRMIN:
 621       case OP_KETRMAX:
 622       if (code != end_code)
 623         {
 624         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 625         if (codevalue != OP_KET)
 626           {
 627           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 628           }
 629         }
 630       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
 631         {
 632         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 633           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 634             match_count = 0;
 635         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 636         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 637         if (offsetcount >= 2)
 638           {
 639           offsets[0] = current_subject - start_subject;
 640           offsets[1] = ptr - start_subject;
 641           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 642             offsets[1] - offsets[0], current_subject));
 643           }
 644         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 645           {
 646           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 647             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 648             match_count, rlevel*2-2, SP));
 649           return match_count;
 650           }
 651         }
 652       break;
 653
 654 /* ========================================================================== */
 655       /* These opcodes add to the current list of states without looking
 656       at the current character. */
 657
 658       /*-----------------------------------------------------------------*/
 659       case OP_ALT:
 660       do { code += GET(code, 1); } while (*code == OP_ALT);
 661       ADD_ACTIVE(code - start_code, 0);
 662       break;
 663
 664       /*-----------------------------------------------------------------*/
 665       case OP_BRA:
 666       case OP_SBRA:
 667       do
 668         {
 669         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 670         code += GET(code, 1);
 671         }
 672       while (*code == OP_ALT);
 673       break;
 674
 675       /*-----------------------------------------------------------------*/
 676       case OP_CBRA:
 677       case OP_SCBRA:
 678       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 679       code += GET(code, 1);
 680       while (*code == OP_ALT)
 681         {
 682         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 683         code += GET(code, 1);
 684         }
 685       break;
 686
 687       /*-----------------------------------------------------------------*/
 688       case OP_BRAZERO:
 689       case OP_BRAMINZERO:
 690       ADD_ACTIVE(state_offset + 1, 0);
 691       code += 1 + GET(code, 2);
 692       while (*code == OP_ALT) code += GET(code, 1);
 693       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 694       break;
 695
 696       /*-----------------------------------------------------------------*/
 697       case OP_SKIPZERO:
 698       code += 1 + GET(code, 2);
 699       while (*code == OP_ALT) code += GET(code, 1);
 700       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 701       break;
 702
 703       /*-----------------------------------------------------------------*/
 704       case OP_CIRC:
 705       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 706           ((ims & PCRE_MULTILINE) != 0 &&
 707             ptr != end_subject &&
 708             WAS_NEWLINE(ptr)))
 709         { ADD_ACTIVE(state_offset + 1, 0); }
 710       break;
 711
 712       /*-----------------------------------------------------------------*/
 713       case OP_EOD:
 714       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 715       break;
 716
 717       /*-----------------------------------------------------------------*/
 718       case OP_OPT:
 719       ims = code[1];
 720       ADD_ACTIVE(state_offset + 2, 0);
 721       break;
 722
 723       /*-----------------------------------------------------------------*/
 724       case OP_SOD:
 725       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 726       break;
 727
 728       /*-----------------------------------------------------------------*/
 729       case OP_SOM:
 730       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 731       break;
 732
 733
 734 /* ========================================================================== */
 735       /* These opcodes inspect the next subject character, and sometimes
 736       the previous one as well, but do not have an argument. The variable
 737       clen contains the length of the current character and is zero if we are
 738       at the end of the subject. */
 739
 740       /*-----------------------------------------------------------------*/
 741       case OP_ANY:
 742       if (clen > 0 && !IS_NEWLINE(ptr))
 743         { ADD_NEW(state_offset + 1, 0); }
 744       break;
 745
 746       /*-----------------------------------------------------------------*/
 747       case OP_ALLANY:
 748       if (clen > 0)
 749         { ADD_NEW(state_offset + 1, 0); }
 750       break;
 751
 752       /*-----------------------------------------------------------------*/
 753       case OP_EODN:
 754       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 755         { ADD_ACTIVE(state_offset + 1, 0); }
 756       break;
 757
 758       /*-----------------------------------------------------------------*/
 759       case OP_DOLL:
 760       if ((md->moptions & PCRE_NOTEOL) == 0)
 761         {
 762         if (clen == 0 ||
 763             (IS_NEWLINE(ptr) &&
 764                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 765             ))
 766           { ADD_ACTIVE(state_offset + 1, 0); }
 767         }
 768       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 769         { ADD_ACTIVE(state_offset + 1, 0); }
 770       break;
 771
 772       /*-----------------------------------------------------------------*/
 773
 774       case OP_DIGIT:
 775       case OP_WHITESPACE:
 776       case OP_WORDCHAR:
 777       if (clen > 0 && c < 256 &&
 778             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 779         { ADD_NEW(state_offset + 1, 0); }
 780       break;
 781
 782       /*-----------------------------------------------------------------*/
 783       case OP_NOT_DIGIT:
 784       case OP_NOT_WHITESPACE:
 785       case OP_NOT_WORDCHAR:
 786       if (clen > 0 && (c >= 256 ||
 787             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 788         { ADD_NEW(state_offset + 1, 0); }
 789       break;
 790
 791       /*-----------------------------------------------------------------*/
 792       case OP_WORD_BOUNDARY:
 793       case OP_NOT_WORD_BOUNDARY:
 794         {
 795         int left_word, right_word;
 796
 797         if (ptr > start_subject)
 798           {
 799           const uschar *temp = ptr - 1;
 800 #ifdef SUPPORT_UTF8
 801           if (utf8) BACKCHAR(temp);
 802 #endif
 803           GETCHARTEST(d, temp);
 804           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 805           }
 806         else left_word = 0;
 807
 808         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 809           else right_word = 0;
 810
 811         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 812           { ADD_ACTIVE(state_offset + 1, 0); }
 813         }
 814       break;
 815
 816
 817       /*-----------------------------------------------------------------*/
 818       /* Check the next character by Unicode property. We will get here only
 819       if the support is in the binary; otherwise a compile-time error occurs.
 820       */
 821
 822 #ifdef SUPPORT_UCP
 823       case OP_PROP:
 824       case OP_NOTPROP:
 825       if (clen > 0)
 826         {
 827         BOOL OK;
 828         int category = _pcre_ucp_findprop(c, &chartype, &script);
 829         switch(code[1])
 830           {
 831           case PT_ANY:
 832           OK = TRUE;
 833           break;
 834
 835           case PT_LAMP:
 836           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 837           break;
 838
 839           case PT_GC:
 840           OK = category == code[2];
 841           break;
 842
 843           case PT_PC:
 844           OK = chartype == code[2];
 845           break;
 846
 847           case PT_SC:
 848           OK = script == code[2];
 849           break;
 850
 851           /* Should never occur, but keep compilers from grumbling. */
 852
 853           default:
 854           OK = codevalue != OP_PROP;
 855           break;
 856           }
 857
 858         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 859         }
 860       break;
 861 #endif
 862
 863
 864
 865 /* ========================================================================== */
 866       /* These opcodes likewise inspect the subject character, but have an
 867       argument that is not a data character. It is one of these opcodes: OP_ANY,
 868       OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 869       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 870
 871       case OP_TYPEPLUS:
 872       case OP_TYPEMINPLUS:
 873       case OP_TYPEPOSPLUS:
 874       count = current_state->count;  /* Already matched */
 875       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 876       if (clen > 0)
 877         {
 878         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 879             (c < 256 &&
 880               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 881               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 882           {
 883           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 884             {
 885             active_count--;            /* Remove non-match possibility */
 886             next_active_state--;
 887             }
 888           count++;
 889           ADD_NEW(state_offset, count);
 890           }
 891         }
 892       break;
 893
 894       /*-----------------------------------------------------------------*/
 895       case OP_TYPEQUERY:
 896       case OP_TYPEMINQUERY:
 897       case OP_TYPEPOSQUERY:
 898       ADD_ACTIVE(state_offset + 2, 0);
 899       if (clen > 0)
 900         {
 901         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 902             (c < 256 &&
 903               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 904               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 905           {
 906           if (codevalue == OP_TYPEPOSQUERY)
 907             {
 908             active_count--;            /* Remove non-match possibility */
 909             next_active_state--;
 910             }
 911           ADD_NEW(state_offset + 2, 0);
 912           }
 913         }
 914       break;
 915
 916       /*-----------------------------------------------------------------*/
 917       case OP_TYPESTAR:
 918       case OP_TYPEMINSTAR:
 919       case OP_TYPEPOSSTAR:
 920       ADD_ACTIVE(state_offset + 2, 0);
 921       if (clen > 0)
 922         {
 923         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 924             (c < 256 &&
 925               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 926               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 927           {
 928           if (codevalue == OP_TYPEPOSSTAR)
 929             {
 930             active_count--;            /* Remove non-match possibility */
 931             next_active_state--;
 932             }
 933           ADD_NEW(state_offset, 0);
 934           }
 935         }
 936       break;
 937
 938       /*-----------------------------------------------------------------*/
 939       case OP_TYPEEXACT:
 940       count = current_state->count;  /* Number already matched */
 941       if (clen > 0)
 942         {
 943         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 944             (c < 256 &&
 945               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 946               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 947           {
 948           if (++count >= GET2(code, 1))
 949             { ADD_NEW(state_offset + 4, 0); }
 950           else
 951             { ADD_NEW(state_offset, count); }
 952           }
 953         }
 954       break;
 955
 956       /*-----------------------------------------------------------------*/
 957       case OP_TYPEUPTO:
 958       case OP_TYPEMINUPTO:
 959       case OP_TYPEPOSUPTO:
 960       ADD_ACTIVE(state_offset + 4, 0);
 961       count = current_state->count;  /* Number already matched */
 962       if (clen > 0)
 963         {
 964         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 965             (c < 256 &&
 966               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 967               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 968           {
 969           if (codevalue == OP_TYPEPOSUPTO)
 970             {
 971             active_count--;           /* Remove non-match possibility */
 972             next_active_state--;
 973             }
 974           if (++count >= GET2(code, 1))
 975             { ADD_NEW(state_offset + 4, 0); }
 976           else
 977             { ADD_NEW(state_offset, count); }
 978           }
 979         }
 980       break;
 981
 982 /* ========================================================================== */
 983       /* These are virtual opcodes that are used when something like
 984       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
 985       OP_VSPACE/OP_HSPACE family as its argument. This keeps the code above
 986       fast for the other cases. The argument is in the d variable. */
 987
 988 #ifdef SUPPORT_UCP
 989       case OP_PROP_EXTRA + OP_TYPEPLUS:
 990       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 991       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 992       count = current_state->count;           /* Already matched */
 993       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 994       if (clen > 0)
 995         {
 996         BOOL OK;
 997         int category = _pcre_ucp_findprop(c, &chartype, &script);
 998         switch(code[2])
 999           {
1000           case PT_ANY:
1001           OK = TRUE;
1002           break;
1003
1004           case PT_LAMP:
1005           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006           break;
1007
1008           case PT_GC:
1009           OK = category == code[3];
1010           break;
1011
1012           case PT_PC:
1013           OK = chartype == code[3];
1014           break;
1015
1016           case PT_SC:
1017           OK = script == code[3];
1018           break;
1019
1020           /* Should never occur, but keep compilers from grumbling. */
1021
1022           default:
1023           OK = codevalue != OP_PROP;
1024           break;
1025           }
1026
1027         if (OK == (d == OP_PROP))
1028           {
1029           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030             {
1031             active_count--;           /* Remove non-match possibility */
1032             next_active_state--;
1033             }
1034           count++;
1035           ADD_NEW(state_offset, count);
1036           }
1037         }
1038       break;
1039
1040       /*-----------------------------------------------------------------*/
1041       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044       count = current_state->count;  /* Already matched */
1045       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047         {
1048         const uschar *nptr = ptr + clen;
1049         int ncount = 0;
1050         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051           {
1052           active_count--;           /* Remove non-match possibility */
1053           next_active_state--;
1054           }
1055         while (nptr < end_subject)
1056           {
1057           int nd;
1058           int ndlen = 1;
1059           GETCHARLEN(nd, nptr, ndlen);
1060           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061           ncount++;
1062           nptr += ndlen;
1063           }
1064         count++;
1065         ADD_NEW_DATA(-state_offset, count, ncount);
1066         }
1067       break;
1068 #endif
1069
1070       /*-----------------------------------------------------------------*/
1071       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074       count = current_state->count;  /* Already matched */
1075       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076       if (clen > 0)
1077         {
1078         int ncount = 0;
1079         switch (c)
1080           {
1081           case 0x000b:
1082           case 0x000c:
1083           case 0x0085:
1084           case 0x2028:
1085           case 0x2029:
1086           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1087           goto ANYNL01;
1088
1089           case 0x000d:
1090           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1091           /* Fall through */
1092
1093           ANYNL01:
1094           case 0x000a:
1095           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1096             {
1097             active_count--;           /* Remove non-match possibility */
1098             next_active_state--;
1099             }
1100           count++;
1101           ADD_NEW_DATA(-state_offset, count, ncount);
1102           break;
1103
1104           default:
1105           break;
1106           }
1107         }
1108       break;
1109
1110       /*-----------------------------------------------------------------*/
1111       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1112       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1113       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1114       count = current_state->count;  /* Already matched */
1115       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1116       if (clen > 0)
1117         {
1118         BOOL OK;
1119         switch (c)
1120           {
1121           case 0x000a:
1122           case 0x000b:
1123           case 0x000c:
1124           case 0x000d:
1125           case 0x0085:
1126           case 0x2028:
1127           case 0x2029:
1128           OK = TRUE;
1129           break;
1130
1131           default:
1132           OK = FALSE;
1133           break;
1134           }
1135
1136         if (OK == (d == OP_VSPACE))
1137           {
1138           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1139             {
1140             active_count--;           /* Remove non-match possibility */
1141             next_active_state--;
1142             }
1143           count++;
1144           ADD_NEW_DATA(-state_offset, count, 0);
1145           }
1146         }
1147       break;
1148
1149       /*-----------------------------------------------------------------*/
1150       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1151       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1152       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1153       count = current_state->count;  /* Already matched */
1154       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1155       if (clen > 0)
1156         {
1157         BOOL OK;
1158         switch (c)
1159           {
1160           case 0x09:      /* HT */
1161           case 0x20:      /* SPACE */
1162           case 0xa0:      /* NBSP */
1163           case 0x1680:    /* OGHAM SPACE MARK */
1164           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1165           case 0x2000:    /* EN QUAD */
1166           case 0x2001:    /* EM QUAD */
1167           case 0x2002:    /* EN SPACE */
1168           case 0x2003:    /* EM SPACE */
1169           case 0x2004:    /* THREE-PER-EM SPACE */
1170           case 0x2005:    /* FOUR-PER-EM SPACE */
1171           case 0x2006:    /* SIX-PER-EM SPACE */
1172           case 0x2007:    /* FIGURE SPACE */
1173           case 0x2008:    /* PUNCTUATION SPACE */
1174           case 0x2009:    /* THIN SPACE */
1175           case 0x200A:    /* HAIR SPACE */
1176           case 0x202f:    /* NARROW NO-BREAK SPACE */
1177           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1178           case 0x3000:    /* IDEOGRAPHIC SPACE */
1179           OK = TRUE;
1180           break;
1181
1182           default:
1183           OK = FALSE;
1184           break;
1185           }
1186
1187         if (OK == (d == OP_HSPACE))
1188           {
1189           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1190             {
1191             active_count--;           /* Remove non-match possibility */
1192             next_active_state--;
1193             }
1194           count++;
1195           ADD_NEW_DATA(-state_offset, count, 0);
1196           }
1197         }
1198       break;
1199
1200       /*-----------------------------------------------------------------*/
1201 #ifdef SUPPORT_UCP
1202       case OP_PROP_EXTRA + OP_TYPEQUERY:
1203       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1204       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1205       count = 4;
1206       goto QS1;
1207
1208       case OP_PROP_EXTRA + OP_TYPESTAR:
1209       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1210       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1211       count = 0;
1212
1213       QS1:
1214
1215       ADD_ACTIVE(state_offset + 4, 0);
1216       if (clen > 0)
1217         {
1218         BOOL OK;
1219         int category = _pcre_ucp_findprop(c, &chartype, &script);
1220         switch(code[2])
1221           {
1222           case PT_ANY:
1223           OK = TRUE;
1224           break;
1225
1226           case PT_LAMP:
1227           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1228           break;
1229
1230           case PT_GC:
1231           OK = category == code[3];
1232           break;
1233
1234           case PT_PC:
1235           OK = chartype == code[3];
1236           break;
1237
1238           case PT_SC:
1239           OK = script == code[3];
1240           break;
1241
1242           /* Should never occur, but keep compilers from grumbling. */
1243
1244           default:
1245           OK = codevalue != OP_PROP;
1246           break;
1247           }
1248
1249         if (OK == (d == OP_PROP))
1250           {
1251           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1252               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1253             {
1254             active_count--;           /* Remove non-match possibility */
1255             next_active_state--;
1256             }
1257           ADD_NEW(state_offset + count, 0);
1258           }
1259         }
1260       break;
1261
1262       /*-----------------------------------------------------------------*/
1263       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1264       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1265       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1266       count = 2;
1267       goto QS2;
1268
1269       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1270       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1271       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1272       count = 0;
1273
1274       QS2:
1275
1276       ADD_ACTIVE(state_offset + 2, 0);
1277       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1278         {
1279         const uschar *nptr = ptr + clen;
1280         int ncount = 0;
1281         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1282             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1283           {
1284           active_count--;           /* Remove non-match possibility */
1285           next_active_state--;
1286           }
1287         while (nptr < end_subject)
1288           {
1289           int nd;
1290           int ndlen = 1;
1291           GETCHARLEN(nd, nptr, ndlen);
1292           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1293           ncount++;
1294           nptr += ndlen;
1295           }
1296         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1297         }
1298       break;
1299 #endif
1300
1301       /*-----------------------------------------------------------------*/
1302       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1303       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1304       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1305       count = 2;
1306       goto QS3;
1307
1308       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1309       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1310       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1311       count = 0;
1312
1313       QS3:
1314       ADD_ACTIVE(state_offset + 2, 0);
1315       if (clen > 0)
1316         {
1317         int ncount = 0;
1318         switch (c)
1319           {
1320           case 0x000b:
1321           case 0x000c:
1322           case 0x0085:
1323           case 0x2028:
1324           case 0x2029:
1325           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1326           goto ANYNL02;
1327
1328           case 0x000d:
1329           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1330           /* Fall through */
1331
1332           ANYNL02:
1333           case 0x000a:
1334           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1335               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1336             {
1337             active_count--;           /* Remove non-match possibility */
1338             next_active_state--;
1339             }
1340           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341           break;
1342
1343           default:
1344           break;
1345           }
1346         }
1347       break;
1348
1349       /*-----------------------------------------------------------------*/
1350       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1351       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1352       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1353       count = 2;
1354       goto QS4;
1355
1356       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1357       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1358       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1359       count = 0;
1360
1361       QS4:
1362       ADD_ACTIVE(state_offset + 2, 0);
1363       if (clen > 0)
1364         {
1365         BOOL OK;
1366         switch (c)
1367           {
1368           case 0x000a:
1369           case 0x000b:
1370           case 0x000c:
1371           case 0x000d:
1372           case 0x0085:
1373           case 0x2028:
1374           case 0x2029:
1375           OK = TRUE;
1376           break;
1377
1378           default:
1379           OK = FALSE;
1380           break;
1381           }
1382         if (OK == (d == OP_VSPACE))
1383           {
1384           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1385               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1386             {
1387             active_count--;           /* Remove non-match possibility */
1388             next_active_state--;
1389             }
1390           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1391           }
1392         }
1393       break;
1394
1395       /*-----------------------------------------------------------------*/
1396       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1397       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1398       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1399       count = 2;
1400       goto QS5;
1401
1402       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1403       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1404       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1405       count = 0;
1406
1407       QS5:
1408       ADD_ACTIVE(state_offset + 2, 0);
1409       if (clen > 0)
1410         {
1411         BOOL OK;
1412         switch (c)
1413           {
1414           case 0x09:      /* HT */
1415           case 0x20:      /* SPACE */
1416           case 0xa0:      /* NBSP */
1417           case 0x1680:    /* OGHAM SPACE MARK */
1418           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1419           case 0x2000:    /* EN QUAD */
1420           case 0x2001:    /* EM QUAD */
1421           case 0x2002:    /* EN SPACE */
1422           case 0x2003:    /* EM SPACE */
1423           case 0x2004:    /* THREE-PER-EM SPACE */
1424           case 0x2005:    /* FOUR-PER-EM SPACE */
1425           case 0x2006:    /* SIX-PER-EM SPACE */
1426           case 0x2007:    /* FIGURE SPACE */
1427           case 0x2008:    /* PUNCTUATION SPACE */
1428           case 0x2009:    /* THIN SPACE */
1429           case 0x200A:    /* HAIR SPACE */
1430           case 0x202f:    /* NARROW NO-BREAK SPACE */
1431           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1432           case 0x3000:    /* IDEOGRAPHIC SPACE */
1433           OK = TRUE;
1434           break;
1435
1436           default:
1437           OK = FALSE;
1438           break;
1439           }
1440
1441         if (OK == (d == OP_HSPACE))
1442           {
1443           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1444               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1445             {
1446             active_count--;           /* Remove non-match possibility */
1447             next_active_state--;
1448             }
1449           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1450           }
1451         }
1452       break;
1453
1454       /*-----------------------------------------------------------------*/
1455 #ifdef SUPPORT_UCP
1456       case OP_PROP_EXTRA + OP_TYPEEXACT:
1457       case OP_PROP_EXTRA + OP_TYPEUPTO:
1458       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1459       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1460       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1461         { ADD_ACTIVE(state_offset + 6, 0); }
1462       count = current_state->count;  /* Number already matched */
1463       if (clen > 0)
1464         {
1465         BOOL OK;
1466         int category = _pcre_ucp_findprop(c, &chartype, &script);
1467         switch(code[4])
1468           {
1469           case PT_ANY:
1470           OK = TRUE;
1471           break;
1472
1473           case PT_LAMP:
1474           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1475           break;
1476
1477           case PT_GC:
1478           OK = category == code[5];
1479           break;
1480
1481           case PT_PC:
1482           OK = chartype == code[5];
1483           break;
1484
1485           case PT_SC:
1486           OK = script == code[5];
1487           break;
1488
1489           /* Should never occur, but keep compilers from grumbling. */
1490
1491           default:
1492           OK = codevalue != OP_PROP;
1493           break;
1494           }
1495
1496         if (OK == (d == OP_PROP))
1497           {
1498           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1499             {
1500             active_count--;           /* Remove non-match possibility */
1501             next_active_state--;
1502             }
1503           if (++count >= GET2(code, 1))
1504             { ADD_NEW(state_offset + 6, 0); }
1505           else
1506             { ADD_NEW(state_offset, count); }
1507           }
1508         }
1509       break;
1510
1511       /*-----------------------------------------------------------------*/
1512       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1513       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1514       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1515       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1516       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1517         { ADD_ACTIVE(state_offset + 4, 0); }
1518       count = current_state->count;  /* Number already matched */
1519       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1520         {
1521         const uschar *nptr = ptr + clen;
1522         int ncount = 0;
1523         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1524           {
1525           active_count--;           /* Remove non-match possibility */
1526           next_active_state--;
1527           }
1528         while (nptr < end_subject)
1529           {
1530           int nd;
1531           int ndlen = 1;
1532           GETCHARLEN(nd, nptr, ndlen);
1533           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1534           ncount++;
1535           nptr += ndlen;
1536           }
1537         if (++count >= GET2(code, 1))
1538           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1539         else
1540           { ADD_NEW_DATA(-state_offset, count, ncount); }
1541         }
1542       break;
1543 #endif
1544
1545       /*-----------------------------------------------------------------*/
1546       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1547       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1548       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1549       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1550       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1551         { ADD_ACTIVE(state_offset + 4, 0); }
1552       count = current_state->count;  /* Number already matched */
1553       if (clen > 0)
1554         {
1555         int ncount = 0;
1556         switch (c)
1557           {
1558           case 0x000b:
1559           case 0x000c:
1560           case 0x0085:
1561           case 0x2028:
1562           case 0x2029:
1563           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1564           goto ANYNL03;
1565
1566           case 0x000d:
1567           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1568           /* Fall through */
1569
1570           ANYNL03:
1571           case 0x000a:
1572           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1573             {
1574             active_count--;           /* Remove non-match possibility */
1575             next_active_state--;
1576             }
1577           if (++count >= GET2(code, 1))
1578             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1579           else
1580             { ADD_NEW_DATA(-state_offset, count, ncount); }
1581           break;
1582
1583           default:
1584           break;
1585           }
1586         }
1587       break;
1588
1589       /*-----------------------------------------------------------------*/
1590       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1591       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1592       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1593       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1594       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1595         { ADD_ACTIVE(state_offset + 4, 0); }
1596       count = current_state->count;  /* Number already matched */
1597       if (clen > 0)
1598         {
1599         BOOL OK;
1600         switch (c)
1601           {
1602           case 0x000a:
1603           case 0x000b:
1604           case 0x000c:
1605           case 0x000d:
1606           case 0x0085:
1607           case 0x2028:
1608           case 0x2029:
1609           OK = TRUE;
1610           break;
1611
1612           default:
1613           OK = FALSE;
1614           }
1615
1616         if (OK == (d == OP_VSPACE))
1617           {
1618           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1619             {
1620             active_count--;           /* Remove non-match possibility */
1621             next_active_state--;
1622             }
1623           if (++count >= GET2(code, 1))
1624             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1625           else
1626             { ADD_NEW_DATA(-state_offset, count, 0); }
1627           }
1628         }
1629       break;
1630
1631       /*-----------------------------------------------------------------*/
1632       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1633       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1634       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1635       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1636       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1637         { ADD_ACTIVE(state_offset + 4, 0); }
1638       count = current_state->count;  /* Number already matched */
1639       if (clen > 0)
1640         {
1641         BOOL OK;
1642         switch (c)
1643           {
1644           case 0x09:      /* HT */
1645           case 0x20:      /* SPACE */
1646           case 0xa0:      /* NBSP */
1647           case 0x1680:    /* OGHAM SPACE MARK */
1648           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1649           case 0x2000:    /* EN QUAD */
1650           case 0x2001:    /* EM QUAD */
1651           case 0x2002:    /* EN SPACE */
1652           case 0x2003:    /* EM SPACE */
1653           case 0x2004:    /* THREE-PER-EM SPACE */
1654           case 0x2005:    /* FOUR-PER-EM SPACE */
1655           case 0x2006:    /* SIX-PER-EM SPACE */
1656           case 0x2007:    /* FIGURE SPACE */
1657           case 0x2008:    /* PUNCTUATION SPACE */
1658           case 0x2009:    /* THIN SPACE */
1659           case 0x200A:    /* HAIR SPACE */
1660           case 0x202f:    /* NARROW NO-BREAK SPACE */
1661           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1662           case 0x3000:    /* IDEOGRAPHIC SPACE */
1663           OK = TRUE;
1664           break;
1665
1666           default:
1667           OK = FALSE;
1668           break;
1669           }
1670
1671         if (OK == (d == OP_HSPACE))
1672           {
1673           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1674             {
1675             active_count--;           /* Remove non-match possibility */
1676             next_active_state--;
1677             }
1678           if (++count >= GET2(code, 1))
1679             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1680           else
1681             { ADD_NEW_DATA(-state_offset, count, 0); }
1682           }
1683         }
1684       break;
1685
1686 /* ========================================================================== */
1687       /* These opcodes are followed by a character that is usually compared
1688       to the current subject character; it is loaded into d. We still get
1689       here even if there is no subject character, because in some cases zero
1690       repetitions are permitted. */
1691
1692       /*-----------------------------------------------------------------*/
1693       case OP_CHAR:
1694       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1695       break;
1696
1697       /*-----------------------------------------------------------------*/
1698       case OP_CHARNC:
1699       if (clen == 0) break;
1700
1701 #ifdef SUPPORT_UTF8
1702       if (utf8)
1703         {
1704         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1705           {
1706           unsigned int othercase;
1707           if (c < 128) othercase = fcc[c]; else
1708
1709           /* If we have Unicode property support, we can use it to test the
1710           other case of the character. */
1711
1712 #ifdef SUPPORT_UCP
1713           othercase = _pcre_ucp_othercase(c);
1714 #else
1715           othercase = NOTACHAR;
1716 #endif
1717
1718           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1719           }
1720         }
1721       else
1722 #endif  /* SUPPORT_UTF8 */
1723
1724       /* Non-UTF-8 mode */
1725         {
1726         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1727         }
1728       break;
1729
1730
1731 #ifdef SUPPORT_UCP
1732       /*-----------------------------------------------------------------*/
1733       /* This is a tricky one because it can match more than one character.
1734       Find out how many characters to skip, and then set up a negative state
1735       to wait for them to pass before continuing. */
1736
1737       case OP_EXTUNI:
1738       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1739         {
1740         const uschar *nptr = ptr + clen;
1741         int ncount = 0;
1742         while (nptr < end_subject)
1743           {
1744           int nclen = 1;
1745           GETCHARLEN(c, nptr, nclen);
1746           if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1747           ncount++;
1748           nptr += nclen;
1749           }
1750         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1751         }
1752       break;
1753 #endif
1754
1755       /*-----------------------------------------------------------------*/
1756       /* This is tricky, like EXTUNI, because it too can match more than one
1757       character (when CR is followed by LF). In this case, set up a negative
1758       state to wait for one character to pass before continuing. */
1759
1760       case OP_ANYNL:
1761       if (clen > 0) switch(c)
1762         {
1763         case 0x000b:
1764         case 0x000c:
1765         case 0x0085:
1766         case 0x2028:
1767         case 0x2029:
1768         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1769
1770         case 0x000a:
1771         ADD_NEW(state_offset + 1, 0);
1772         break;
1773
1774         case 0x000d:
1775         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1776           {
1777           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1778           }
1779         else
1780           {
1781           ADD_NEW(state_offset + 1, 0);
1782           }
1783         break;
1784         }
1785       break;
1786
1787       /*-----------------------------------------------------------------*/
1788       case OP_NOT_VSPACE:
1789       if (clen > 0) switch(c)
1790         {
1791         case 0x000a:
1792         case 0x000b:
1793         case 0x000c:
1794         case 0x000d:
1795         case 0x0085:
1796         case 0x2028:
1797         case 0x2029:
1798         break;
1799
1800         default:
1801         ADD_NEW(state_offset + 1, 0);
1802         break;
1803         }
1804       break;
1805
1806       /*-----------------------------------------------------------------*/
1807       case OP_VSPACE:
1808       if (clen > 0) switch(c)
1809         {
1810         case 0x000a:
1811         case 0x000b:
1812         case 0x000c:
1813         case 0x000d:
1814         case 0x0085:
1815         case 0x2028:
1816         case 0x2029:
1817         ADD_NEW(state_offset + 1, 0);
1818         break;
1819
1820         default: break;
1821         }
1822       break;
1823
1824       /*-----------------------------------------------------------------*/
1825       case OP_NOT_HSPACE:
1826       if (clen > 0) switch(c)
1827         {
1828         case 0x09:      /* HT */
1829         case 0x20:      /* SPACE */
1830         case 0xa0:      /* NBSP */
1831         case 0x1680:    /* OGHAM SPACE MARK */
1832         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1833         case 0x2000:    /* EN QUAD */
1834         case 0x2001:    /* EM QUAD */
1835         case 0x2002:    /* EN SPACE */
1836         case 0x2003:    /* EM SPACE */
1837         case 0x2004:    /* THREE-PER-EM SPACE */
1838         case 0x2005:    /* FOUR-PER-EM SPACE */
1839         case 0x2006:    /* SIX-PER-EM SPACE */
1840         case 0x2007:    /* FIGURE SPACE */
1841         case 0x2008:    /* PUNCTUATION SPACE */
1842         case 0x2009:    /* THIN SPACE */
1843         case 0x200A:    /* HAIR SPACE */
1844         case 0x202f:    /* NARROW NO-BREAK SPACE */
1845         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1846         case 0x3000:    /* IDEOGRAPHIC SPACE */
1847         break;
1848
1849         default:
1850         ADD_NEW(state_offset + 1, 0);
1851         break;
1852         }
1853       break;
1854
1855       /*-----------------------------------------------------------------*/
1856       case OP_HSPACE:
1857       if (clen > 0) switch(c)
1858         {
1859         case 0x09:      /* HT */
1860         case 0x20:      /* SPACE */
1861         case 0xa0:      /* NBSP */
1862         case 0x1680:    /* OGHAM SPACE MARK */
1863         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1864         case 0x2000:    /* EN QUAD */
1865         case 0x2001:    /* EM QUAD */
1866         case 0x2002:    /* EN SPACE */
1867         case 0x2003:    /* EM SPACE */
1868         case 0x2004:    /* THREE-PER-EM SPACE */
1869         case 0x2005:    /* FOUR-PER-EM SPACE */
1870         case 0x2006:    /* SIX-PER-EM SPACE */
1871         case 0x2007:    /* FIGURE SPACE */
1872         case 0x2008:    /* PUNCTUATION SPACE */
1873         case 0x2009:    /* THIN SPACE */
1874         case 0x200A:    /* HAIR SPACE */
1875         case 0x202f:    /* NARROW NO-BREAK SPACE */
1876         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1877         case 0x3000:    /* IDEOGRAPHIC SPACE */
1878         ADD_NEW(state_offset + 1, 0);
1879         break;
1880         }
1881       break;
1882
1883       /*-----------------------------------------------------------------*/
1884       /* Match a negated single character. This is only used for one-byte
1885       characters, that is, we know that d < 256. The character we are
1886       checking (c) can be multibyte. */
1887
1888       case OP_NOT:
1889       if (clen > 0)
1890         {
1891         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1892         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1893         }
1894       break;
1895
1896       /*-----------------------------------------------------------------*/
1897       case OP_PLUS:
1898       case OP_MINPLUS:
1899       case OP_POSPLUS:
1900       case OP_NOTPLUS:
1901       case OP_NOTMINPLUS:
1902       case OP_NOTPOSPLUS:
1903       count = current_state->count;  /* Already matched */
1904       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1905       if (clen > 0)
1906         {
1907         unsigned int otherd = NOTACHAR;
1908         if ((ims & PCRE_CASELESS) != 0)
1909           {
1910 #ifdef SUPPORT_UTF8
1911           if (utf8 && d >= 128)
1912             {
1913 #ifdef SUPPORT_UCP
1914             otherd = _pcre_ucp_othercase(d);
1915 #endif  /* SUPPORT_UCP */
1916             }
1917           else
1918 #endif  /* SUPPORT_UTF8 */
1919           otherd = fcc[d];
1920           }
1921         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1922           {
1923           if (count > 0 &&
1924               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1925             {
1926             active_count--;             /* Remove non-match possibility */
1927             next_active_state--;
1928             }
1929           count++;
1930           ADD_NEW(state_offset, count);
1931           }
1932         }
1933       break;
1934
1935       /*-----------------------------------------------------------------*/
1936       case OP_QUERY:
1937       case OP_MINQUERY:
1938       case OP_POSQUERY:
1939       case OP_NOTQUERY:
1940       case OP_NOTMINQUERY:
1941       case OP_NOTPOSQUERY:
1942       ADD_ACTIVE(state_offset + dlen + 1, 0);
1943       if (clen > 0)
1944         {
1945         unsigned int otherd = NOTACHAR;
1946         if ((ims & PCRE_CASELESS) != 0)
1947           {
1948 #ifdef SUPPORT_UTF8
1949           if (utf8 && d >= 128)
1950             {
1951 #ifdef SUPPORT_UCP
1952             otherd = _pcre_ucp_othercase(d);
1953 #endif  /* SUPPORT_UCP */
1954             }
1955           else
1956 #endif  /* SUPPORT_UTF8 */
1957           otherd = fcc[d];
1958           }
1959         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1960           {
1961           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1962             {
1963             active_count--;            /* Remove non-match possibility */
1964             next_active_state--;
1965             }
1966           ADD_NEW(state_offset + dlen + 1, 0);
1967           }
1968         }
1969       break;
1970
1971       /*-----------------------------------------------------------------*/
1972       case OP_STAR:
1973       case OP_MINSTAR:
1974       case OP_POSSTAR:
1975       case OP_NOTSTAR:
1976       case OP_NOTMINSTAR:
1977       case OP_NOTPOSSTAR:
1978       ADD_ACTIVE(state_offset + dlen + 1, 0);
1979       if (clen > 0)
1980         {
1981         unsigned int otherd = NOTACHAR;
1982         if ((ims & PCRE_CASELESS) != 0)
1983           {
1984 #ifdef SUPPORT_UTF8
1985           if (utf8 && d >= 128)
1986             {
1987 #ifdef SUPPORT_UCP
1988             otherd = _pcre_ucp_othercase(d);
1989 #endif  /* SUPPORT_UCP */
1990             }
1991           else
1992 #endif  /* SUPPORT_UTF8 */
1993           otherd = fcc[d];
1994           }
1995         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1996           {
1997           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1998             {
1999             active_count--;            /* Remove non-match possibility */
2000             next_active_state--;
2001             }
2002           ADD_NEW(state_offset, 0);
2003           }
2004         }
2005       break;
2006
2007       /*-----------------------------------------------------------------*/
2008       case OP_EXACT:
2009       case OP_NOTEXACT:
2010       count = current_state->count;  /* Number already matched */
2011       if (clen > 0)
2012         {
2013         unsigned int otherd = NOTACHAR;
2014         if ((ims & PCRE_CASELESS) != 0)
2015           {
2016 #ifdef SUPPORT_UTF8
2017           if (utf8 && d >= 128)
2018             {
2019 #ifdef SUPPORT_UCP
2020             otherd = _pcre_ucp_othercase(d);
2021 #endif  /* SUPPORT_UCP */
2022             }
2023           else
2024 #endif  /* SUPPORT_UTF8 */
2025           otherd = fcc[d];
2026           }
2027         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2028           {
2029           if (++count >= GET2(code, 1))
2030             { ADD_NEW(state_offset + dlen + 3, 0); }
2031           else
2032             { ADD_NEW(state_offset, count); }
2033           }
2034         }
2035       break;
2036
2037       /*-----------------------------------------------------------------*/
2038       case OP_UPTO:
2039       case OP_MINUPTO:
2040       case OP_POSUPTO:
2041       case OP_NOTUPTO:
2042       case OP_NOTMINUPTO:
2043       case OP_NOTPOSUPTO:
2044       ADD_ACTIVE(state_offset + dlen + 3, 0);
2045       count = current_state->count;  /* Number already matched */
2046       if (clen > 0)
2047         {
2048         unsigned int otherd = NOTACHAR;
2049         if ((ims & PCRE_CASELESS) != 0)
2050           {
2051 #ifdef SUPPORT_UTF8
2052           if (utf8 && d >= 128)
2053             {
2054 #ifdef SUPPORT_UCP
2055             otherd = _pcre_ucp_othercase(d);
2056 #endif  /* SUPPORT_UCP */
2057             }
2058           else
2059 #endif  /* SUPPORT_UTF8 */
2060           otherd = fcc[d];
2061           }
2062         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2063           {
2064           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2065             {
2066             active_count--;             /* Remove non-match possibility */
2067             next_active_state--;
2068             }
2069           if (++count >= GET2(code, 1))
2070             { ADD_NEW(state_offset + dlen + 3, 0); }
2071           else
2072             { ADD_NEW(state_offset, count); }
2073           }
2074         }
2075       break;
2076
2077
2078 /* ========================================================================== */
2079       /* These are the class-handling opcodes */
2080
2081       case OP_CLASS:
2082       case OP_NCLASS:
2083       case OP_XCLASS:
2084         {
2085         BOOL isinclass = FALSE;
2086         int next_state_offset;
2087         const uschar *ecode;
2088
2089         /* For a simple class, there is always just a 32-byte table, and we
2090         can set isinclass from it. */
2091
2092         if (codevalue != OP_XCLASS)
2093           {
2094           ecode = code + 33;
2095           if (clen > 0)
2096             {
2097             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2098               ((code[1 + c/8] & (1 << (c&7))) != 0);
2099             }
2100           }
2101
2102         /* An extended class may have a table or a list of single characters,
2103         ranges, or both, and it may be positive or negative. There's a
2104         function that sorts all this out. */
2105
2106         else
2107          {
2108          ecode = code + GET(code, 1);
2109          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2110          }
2111
2112         /* At this point, isinclass is set for all kinds of class, and ecode
2113         points to the byte after the end of the class. If there is a
2114         quantifier, this is where it will be. */
2115
2116         next_state_offset = ecode - start_code;
2117
2118         switch (*ecode)
2119           {
2120           case OP_CRSTAR:
2121           case OP_CRMINSTAR:
2122           ADD_ACTIVE(next_state_offset + 1, 0);
2123           if (isinclass) { ADD_NEW(state_offset, 0); }
2124           break;
2125
2126           case OP_CRPLUS:
2127           case OP_CRMINPLUS:
2128           count = current_state->count;  /* Already matched */
2129           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2130           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2131           break;
2132
2133           case OP_CRQUERY:
2134           case OP_CRMINQUERY:
2135           ADD_ACTIVE(next_state_offset + 1, 0);
2136           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2137           break;
2138
2139           case OP_CRRANGE:
2140           case OP_CRMINRANGE:
2141           count = current_state->count;  /* Already matched */
2142           if (count >= GET2(ecode, 1))
2143             { ADD_ACTIVE(next_state_offset + 5, 0); }
2144           if (isinclass)
2145             {
2146             int max = GET2(ecode, 3);
2147             if (++count >= max && max != 0)   /* Max 0 => no limit */
2148               { ADD_NEW(next_state_offset + 5, 0); }
2149             else
2150               { ADD_NEW(state_offset, count); }
2151             }
2152           break;
2153
2154           default:
2155           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2156           break;
2157           }
2158         }
2159       break;
2160
2161 /* ========================================================================== */
2162       /* These are the opcodes for fancy brackets of various kinds. We have
2163       to use recursion in order to handle them. The "always failing" assertion
2164       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2165       though the other "backtracking verbs" are not supported. */
2166
2167       case OP_FAIL:
2168       break;
2169
2170       case OP_ASSERT:
2171       case OP_ASSERT_NOT:
2172       case OP_ASSERTBACK:
2173       case OP_ASSERTBACK_NOT:
2174         {
2175         int rc;
2176         int local_offsets[2];
2177         int local_workspace[1000];
2178         const uschar *endasscode = code + GET(code, 1);
2179
2180         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2181
2182         rc = internal_dfa_exec(
2183           md,                                   /* static match data */
2184           code,                                 /* this subexpression's code */
2185           ptr,                                  /* where we currently are */
2186           ptr - start_subject,                  /* start offset */
2187           local_offsets,                        /* offset vector */
2188           sizeof(local_offsets)/sizeof(int),    /* size of same */
2189           local_workspace,                      /* workspace vector */
2190           sizeof(local_workspace)/sizeof(int),  /* size of same */
2191           ims,                                  /* the current ims flags */
2192           rlevel,                               /* function recursion level */
2193           recursing);                           /* pass on regex recursion */
2194
2195         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2196             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2197         }
2198       break;
2199
2200       /*-----------------------------------------------------------------*/
2201       case OP_COND:
2202       case OP_SCOND:
2203         {
2204         int local_offsets[1000];
2205         int local_workspace[1000];
2206         int condcode = code[LINK_SIZE+1];
2207
2208         /* Back reference conditions are not supported */
2209
2210         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2211
2212         /* The DEFINE condition is always false */
2213
2214         if (condcode == OP_DEF)
2215           {
2216           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2217           }
2218
2219         /* The only supported version of OP_RREF is for the value RREF_ANY,
2220         which means "test if in any recursion". We can't test for specifically
2221         recursed groups. */
2222
2223         else if (condcode == OP_RREF)
2224           {
2225           int value = GET2(code, LINK_SIZE+2);
2226           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2227           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2228             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229           }
2230
2231         /* Otherwise, the condition is an assertion */
2232
2233         else
2234           {
2235           int rc;
2236           const uschar *asscode = code + LINK_SIZE + 1;
2237           const uschar *endasscode = asscode + GET(asscode, 1);
2238
2239           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2240
2241           rc = internal_dfa_exec(
2242             md,                                   /* fixed match data */
2243             asscode,                              /* this subexpression's code */
2244             ptr,                                  /* where we currently are */
2245             ptr - start_subject,                  /* start offset */
2246             local_offsets,                        /* offset vector */
2247             sizeof(local_offsets)/sizeof(int),    /* size of same */
2248             local_workspace,                      /* workspace vector */
2249             sizeof(local_workspace)/sizeof(int),  /* size of same */
2250             ims,                                  /* the current ims flags */
2251             rlevel,                               /* function recursion level */
2252             recursing);                           /* pass on regex recursion */
2253
2254           if ((rc >= 0) ==
2255                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2256             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2257           else
2258             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2259           }
2260         }
2261       break;
2262
2263       /*-----------------------------------------------------------------*/
2264       case OP_RECURSE:
2265         {
2266         int local_offsets[1000];
2267         int local_workspace[1000];
2268         int rc;
2269
2270         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2271           recursing + 1));
2272
2273         rc = internal_dfa_exec(
2274           md,                                   /* fixed match data */
2275           start_code + GET(code, 1),            /* this subexpression's code */
2276           ptr,                                  /* where we currently are */
2277           ptr - start_subject,                  /* start offset */
2278           local_offsets,                        /* offset vector */
2279           sizeof(local_offsets)/sizeof(int),    /* size of same */
2280           local_workspace,                      /* workspace vector */
2281           sizeof(local_workspace)/sizeof(int),  /* size of same */
2282           ims,                                  /* the current ims flags */
2283           rlevel,                               /* function recursion level */
2284           recursing + 1);                       /* regex recurse level */
2285
2286         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2287           recursing + 1, rc));
2288
2289         /* Ran out of internal offsets */
2290
2291         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2292
2293         /* For each successful matched substring, set up the next state with a
2294         count of characters to skip before trying it. Note that the count is in
2295         characters, not bytes. */
2296
2297         if (rc > 0)
2298           {
2299           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2300             {
2301             const uschar *p = start_subject + local_offsets[rc];
2302             const uschar *pp = start_subject + local_offsets[rc+1];
2303             int charcount = local_offsets[rc+1] - local_offsets[rc];
2304             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2305             if (charcount > 0)
2306               {
2307               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2308               }
2309             else
2310               {
2311               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2312               }
2313             }
2314           }
2315         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2316         }
2317       break;
2318
2319       /*-----------------------------------------------------------------*/
2320       case OP_ONCE:
2321         {
2322         int local_offsets[2];
2323         int local_workspace[1000];
2324
2325         int rc = internal_dfa_exec(
2326           md,                                   /* fixed match data */
2327           code,                                 /* this subexpression's code */
2328           ptr,                                  /* where we currently are */
2329           ptr - start_subject,                  /* start offset */
2330           local_offsets,                        /* offset vector */
2331           sizeof(local_offsets)/sizeof(int),    /* size of same */
2332           local_workspace,                      /* workspace vector */
2333           sizeof(local_workspace)/sizeof(int),  /* size of same */
2334           ims,                                  /* the current ims flags */
2335           rlevel,                               /* function recursion level */
2336           recursing);                           /* pass on regex recursion */
2337
2338         if (rc >= 0)
2339           {
2340           const uschar *end_subpattern = code;
2341           int charcount = local_offsets[1] - local_offsets[0];
2342           int next_state_offset, repeat_state_offset;
2343
2344           do { end_subpattern += GET(end_subpattern, 1); }
2345             while (*end_subpattern == OP_ALT);
2346           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2347
2348           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2349           arrange for the repeat state also to be added to the relevant list.
2350           Calculate the offset, or set -1 for no repeat. */
2351
2352           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2353                                  *end_subpattern == OP_KETRMIN)?
2354             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2355
2356           /* If we have matched an empty string, add the next state at the
2357           current character pointer. This is important so that the duplicate
2358           checking kicks in, which is what breaks infinite loops that match an
2359           empty string. */
2360
2361           if (charcount == 0)
2362             {
2363             ADD_ACTIVE(next_state_offset, 0);
2364             }
2365
2366           /* Optimization: if there are no more active states, and there
2367           are no new states yet set up, then skip over the subject string
2368           right here, to save looping. Otherwise, set up the new state to swing
2369           into action when the end of the substring is reached. */
2370
2371           else if (i + 1 >= active_count && new_count == 0)
2372             {
2373             ptr += charcount;
2374             clen = 0;
2375             ADD_NEW(next_state_offset, 0);
2376
2377             /* If we are adding a repeat state at the new character position,
2378             we must fudge things so that it is the only current state.
2379             Otherwise, it might be a duplicate of one we processed before, and
2380             that would cause it to be skipped. */
2381
2382             if (repeat_state_offset >= 0)
2383               {
2384               next_active_state = active_states;
2385               active_count = 0;
2386               i = -1;
2387               ADD_ACTIVE(repeat_state_offset, 0);
2388               }
2389             }
2390           else
2391             {
2392             const uschar *p = start_subject + local_offsets[0];
2393             const uschar *pp = start_subject + local_offsets[1];
2394             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2395             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2396             if (repeat_state_offset >= 0)
2397               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2398             }
2399
2400           }
2401         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2402         }
2403       break;
2404
2405
2406 /* ========================================================================== */
2407       /* Handle callouts */
2408
2409       case OP_CALLOUT:
2410       if (pcre_callout != NULL)
2411         {
2412         int rrc;
2413         pcre_callout_block cb;
2414         cb.version          = 1;   /* Version 1 of the callout block */
2415         cb.callout_number   = code[1];
2416         cb.offset_vector    = offsets;
2417         cb.subject          = (PCRE_SPTR)start_subject;
2418         cb.subject_length   = end_subject - start_subject;
2419         cb.start_match      = current_subject - start_subject;
2420         cb.current_position = ptr - start_subject;
2421         cb.pattern_position = GET(code, 2);
2422         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2423         cb.capture_top      = 1;
2424         cb.capture_last     = -1;
2425         cb.callout_data     = md->callout_data;
2426         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2427         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2428         }
2429       break;
2430
2431
2432 /* ========================================================================== */
2433       default:        /* Unsupported opcode */
2434       return PCRE_ERROR_DFA_UITEM;
2435       }
2436
2437     NEXT_ACTIVE_STATE: continue;
2438
2439     }      /* End of loop scanning active states */
2440
2441   /* We have finished the processing at the current subject character. If no
2442   new states have been set for the next character, we have found all the
2443   matches that we are going to find. If we are at the top level and partial
2444   matching has been requested, check for appropriate conditions. */
2445
2446   if (new_count <= 0)
2447     {
2448     if (match_count < 0 &&                     /* No matches found */
2449         rlevel == 1 &&                         /* Top level match function */
2450         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2451         ptr >= end_subject &&                  /* Reached end of subject */
2452         ptr > current_subject)                 /* Matched non-empty string */
2453       {
2454       if (offsetcount >= 2)
2455         {
2456         offsets[0] = current_subject - start_subject;
2457         offsets[1] = end_subject - start_subject;
2458         }
2459       match_count = PCRE_ERROR_PARTIAL;
2460       }
2461
2462     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2463       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2464       rlevel*2-2, SP));
2465     break;        /* In effect, "return", but see the comment below */
2466     }
2467
2468   /* One or more states are active for the next character. */
2469
2470   ptr += clen;    /* Advance to next subject character */
2471   }               /* Loop to move along the subject string */
2472
2473 /* Control gets here from "break" a few lines above. We do it this way because
2474 if we use "return" above, we have compiler trouble. Some compilers warn if
2475 there's nothing here because they think the function doesn't return a value. On
2476 the other hand, if we put a dummy statement here, some more clever compilers
2477 complain that it can't be reached. Sigh. */
2478
2479 return match_count;
2480 }
2481
2482
2483
2484
2485 /*************************************************
2486 *    Execute a Regular Expression - DFA engine   *
2487 *************************************************/
2488
2489 /* This external function applies a compiled re to a subject string using a DFA
2490 engine. This function calls the internal function multiple times if the pattern
2491 is not anchored.
2492
2493 Arguments:
2494   argument_re     points to the compiled expression
2495   extra_data      points to extra data or is NULL
2496   subject         points to the subject string
2497   length          length of subject string (may contain binary zeros)
2498   start_offset    where to start in the subject string
2499   options         option bits
2500   offsets         vector of match offsets
2501   offsetcount     size of same
2502   workspace       workspace vector
2503   wscount         size of same
2504
2505 Returns:          > 0 => number of match offset pairs placed in offsets
2506                   = 0 => offsets overflowed; longest matches are present
2507                    -1 => failed to match
2508                  < -1 => some kind of unexpected problem
2509 */
2510
2511 PCRE_EXP_DEFN int
2512 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2513   const char *subject, int length, int start_offset, int options, int *offsets,
2514   int offsetcount, int *workspace, int wscount)
2515 {
2516 real_pcre *re = (real_pcre *)argument_re;
2517 dfa_match_data match_block;
2518 dfa_match_data *md = &match_block;
2519 BOOL utf8, anchored, startline, firstline;
2520 const uschar *current_subject, *end_subject, *lcc;
2521
2522 pcre_study_data internal_study;
2523 const pcre_study_data *study = NULL;
2524 real_pcre internal_re;
2525
2526 const uschar *req_byte_ptr;
2527 const uschar *start_bits = NULL;
2528 BOOL first_byte_caseless = FALSE;
2529 BOOL req_byte_caseless = FALSE;
2530 int first_byte = -1;
2531 int req_byte = -1;
2532 int req_byte2 = -1;
2533 int newline;
2534
2535 /* Plausibility checks */
2536
2537 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2538 if (re == NULL || subject == NULL || workspace == NULL ||
2539    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2540 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2541 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2542
2543 /* We need to find the pointer to any study data before we test for byte
2544 flipping, so we scan the extra_data block first. This may set two fields in the
2545 match block, so we must initialize them beforehand. However, the other fields
2546 in the match block must not be set until after the byte flipping. */
2547
2548 md->tables = re->tables;
2549 md->callout_data = NULL;
2550
2551 if (extra_data != NULL)
2552   {
2553   unsigned int flags = extra_data->flags;
2554   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2555     study = (const pcre_study_data *)extra_data->study_data;
2556   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2557   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2558     return PCRE_ERROR_DFA_UMLIMIT;
2559   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2560     md->callout_data = extra_data->callout_data;
2561   if ((flags & PCRE_EXTRA_TABLES) != 0)
2562     md->tables = extra_data->tables;
2563   }
2564
2565 /* Check that the first field in the block is the magic number. If it is not,
2566 test for a regex that was compiled on a host of opposite endianness. If this is
2567 the case, flipped values are put in internal_re and internal_study if there was
2568 study data too. */
2569
2570 if (re->magic_number != MAGIC_NUMBER)
2571   {
2572   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2573   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2574   if (study != NULL) study = &internal_study;
2575   }
2576
2577 /* Set some local values */
2578
2579 current_subject = (const unsigned char *)subject + start_offset;
2580 end_subject = (const unsigned char *)subject + length;
2581 req_byte_ptr = current_subject - 1;
2582
2583 #ifdef SUPPORT_UTF8
2584 utf8 = (re->options & PCRE_UTF8) != 0;
2585 #else
2586 utf8 = FALSE;
2587 #endif
2588
2589 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2590   (re->options & PCRE_ANCHORED) != 0;
2591
2592 /* The remaining fixed data for passing around. */
2593
2594 md->start_code = (const uschar *)argument_re +
2595     re->name_table_offset + re->name_count * re->name_entry_size;
2596 md->start_subject = (const unsigned char *)subject;
2597 md->end_subject = end_subject;
2598 md->moptions = options;
2599 md->poptions = re->options;
2600
2601 /* If the BSR option is not set at match time, copy what was set
2602 at compile time. */
2603
2604 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2605   {
2606   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2607     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2608 #ifdef BSR_ANYCRLF
2609   else md->moptions |= PCRE_BSR_ANYCRLF;
2610 #endif
2611   }
2612
2613 /* Handle different types of newline. The three bits give eight cases. If
2614 nothing is set at run time, whatever was used at compile time applies. */
2615
2616 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2617          PCRE_NEWLINE_BITS)
2618   {
2619   case 0: newline = NEWLINE; break;   /* Compile-time default */
2620   case PCRE_NEWLINE_CR: newline = '\r'; break;
2621   case PCRE_NEWLINE_LF: newline = '\n'; break;
2622   case PCRE_NEWLINE_CR+
2623        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2624   case PCRE_NEWLINE_ANY: newline = -1; break;
2625   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2626   default: return PCRE_ERROR_BADNEWLINE;
2627   }
2628
2629 if (newline == -2)
2630   {
2631   md->nltype = NLTYPE_ANYCRLF;
2632   }
2633 else if (newline < 0)
2634   {
2635   md->nltype = NLTYPE_ANY;
2636   }
2637 else
2638   {
2639   md->nltype = NLTYPE_FIXED;
2640   if (newline > 255)
2641     {
2642     md->nllen = 2;
2643     md->nl[0] = (newline >> 8) & 255;
2644     md->nl[1] = newline & 255;
2645     }
2646   else
2647     {
2648     md->nllen = 1;
2649     md->nl[0] = newline;
2650     }
2651   }
2652
2653 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2654 back the character offset. */
2655
2656 #ifdef SUPPORT_UTF8
2657 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2658   {
2659   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2660     return PCRE_ERROR_BADUTF8;
2661   if (start_offset > 0 && start_offset < length)
2662     {
2663     int tb = ((uschar *)subject)[start_offset];
2664     if (tb > 127)
2665       {
2666       tb &= 0xc0;
2667       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2668       }
2669     }
2670   }
2671 #endif
2672
2673 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2674 is a feature that makes it possible to save compiled regex and re-use them
2675 in other programs later. */
2676
2677 if (md->tables == NULL) md->tables = _pcre_default_tables;
2678
2679 /* The lower casing table and the "must be at the start of a line" flag are
2680 used in a loop when finding where to start. */
2681
2682 lcc = md->tables + lcc_offset;
2683 startline = (re->flags & PCRE_STARTLINE) != 0;
2684 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2685
2686 /* Set up the first character to match, if available. The first_byte value is
2687 never set for an anchored regular expression, but the anchoring may be forced
2688 at run time, so we have to test for anchoring. The first char may be unset for
2689 an unanchored pattern, of course. If there's no first char and the pattern was
2690 studied, there may be a bitmap of possible first characters. */
2691
2692 if (!anchored)
2693   {
2694   if ((re->flags & PCRE_FIRSTSET) != 0)
2695     {
2696     first_byte = re->first_byte & 255;
2697     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2698       first_byte = lcc[first_byte];
2699     }
2700   else
2701     {
2702     if (startline && study != NULL &&
2703          (study->options & PCRE_STUDY_MAPPED) != 0)
2704       start_bits = study->start_bits;
2705     }
2706   }
2707
2708 /* For anchored or unanchored matches, there may be a "last known required
2709 character" set. */
2710
2711 if ((re->flags & PCRE_REQCHSET) != 0)
2712   {
2713   req_byte = re->req_byte & 255;
2714   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2715   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2716   }
2717
2718 /* Call the main matching function, looping for a non-anchored regex after a
2719 failed match. Unless restarting, optimize by moving to the first match
2720 character if possible, when not anchored. Then unless wanting a partial match,
2721 check for a required later character. */
2722
2723 for (;;)
2724   {
2725   int rc;
2726
2727   if ((options & PCRE_DFA_RESTART) == 0)
2728     {
2729     const uschar *save_end_subject = end_subject;
2730
2731     /* Advance to a unique first char if possible. If firstline is TRUE, the
2732     start of the match is constrained to the first line of a multiline string.
2733     Implement this by temporarily adjusting end_subject so that we stop
2734     scanning at a newline. If the match fails at the newline, later code breaks
2735     this loop. */
2736
2737     if (firstline)
2738       {
2739       const uschar *t = current_subject;
2740       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2741       end_subject = t;
2742       }
2743
2744     if (first_byte >= 0)
2745       {
2746       if (first_byte_caseless)
2747         while (current_subject < end_subject &&
2748                lcc[*current_subject] != first_byte)
2749           current_subject++;
2750       else
2751         while (current_subject < end_subject && *current_subject != first_byte)
2752           current_subject++;
2753       }
2754
2755     /* Or to just after a linebreak for a multiline match if possible */
2756
2757     else if (startline)
2758       {
2759       if (current_subject > md->start_subject + start_offset)
2760         {
2761         while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2762           current_subject++;
2763
2764         /* If we have just passed a CR and the newline option is ANY or
2765         ANYCRLF, and we are now at a LF, advance the match position by one more
2766         character. */
2767
2768         if (current_subject[-1] == '\r' &&
2769              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2770              current_subject < end_subject &&
2771              *current_subject == '\n')
2772           current_subject++;
2773         }
2774       }
2775
2776     /* Or to a non-unique first char after study */
2777
2778     else if (start_bits != NULL)
2779       {
2780       while (current_subject < end_subject)
2781         {
2782         register unsigned int c = *current_subject;
2783         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2784           else break;
2785         }
2786       }
2787
2788     /* Restore fudged end_subject */
2789
2790     end_subject = save_end_subject;
2791     }
2792
2793   /* If req_byte is set, we know that that character must appear in the subject
2794   for the match to succeed. If the first character is set, req_byte must be
2795   later in the subject; otherwise the test starts at the match point. This
2796   optimization can save a huge amount of work in patterns with nested unlimited
2797   repeats that aren't going to match. Writing separate code for cased/caseless
2798   versions makes it go faster, as does using an autoincrement and backing off
2799   on a match.
2800
2801   HOWEVER: when the subject string is very, very long, searching to its end can
2802   take a long time, and give bad performance on quite ordinary patterns. This
2803   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2804   don't do this when the string is sufficiently long.
2805
2806   ALSO: this processing is disabled when partial matching is requested.
2807   */
2808
2809   if (req_byte >= 0 &&
2810       end_subject - current_subject < REQ_BYTE_MAX &&
2811       (options & PCRE_PARTIAL) == 0)
2812     {
2813     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2814
2815     /* We don't need to repeat the search if we haven't yet reached the
2816     place we found it at last time. */
2817
2818     if (p > req_byte_ptr)
2819       {
2820       if (req_byte_caseless)
2821         {
2822         while (p < end_subject)
2823           {
2824           register int pp = *p++;
2825           if (pp == req_byte || pp == req_byte2) { p--; break; }
2826           }
2827         }
2828       else
2829         {
2830         while (p < end_subject)
2831           {
2832           if (*p++ == req_byte) { p--; break; }
2833           }
2834         }
2835
2836       /* If we can't find the required character, break the matching loop,
2837       which will cause a return or PCRE_ERROR_NOMATCH. */
2838
2839       if (p >= end_subject) break;
2840
2841       /* If we have found the required character, save the point where we
2842       found it, so that we don't search again next time round the loop if
2843       the start hasn't passed this character yet. */
2844
2845       req_byte_ptr = p;
2846       }
2847     }
2848
2849   /* OK, now we can do the business */
2850
2851   rc = internal_dfa_exec(
2852     md,                                /* fixed match data */
2853     md->start_code,                    /* this subexpression's code */
2854     current_subject,                   /* where we currently are */
2855     start_offset,                      /* start offset in subject */
2856     offsets,                           /* offset vector */
2857     offsetcount,                       /* size of same */
2858     workspace,                         /* workspace vector */
2859     wscount,                           /* size of same */
2860     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2861     0,                                 /* function recurse level */
2862     0);                                /* regex recurse level */
2863
2864   /* Anything other than "no match" means we are done, always; otherwise, carry
2865   on only if not anchored. */
2866
2867   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2868
2869   /* Advance to the next subject character unless we are at the end of a line
2870   and firstline is set. */
2871
2872   if (firstline && IS_NEWLINE(current_subject)) break;
2873   current_subject++;
2874   if (utf8)
2875     {
2876     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2877       current_subject++;
2878     }
2879   if (current_subject > end_subject) break;
2880
2881   /* If we have just passed a CR and we are now at a LF, and the pattern does
2882   not contain any explicit matches for \r or \n, and the newline option is CRLF
2883   or ANY or ANYCRLF, advance the match position by one more character. */
2884
2885   if (current_subject[-1] == '\r' &&
2886       current_subject < end_subject &&
2887       *current_subject == '\n' &&
2888       (re->flags & PCRE_HASCRORLF) == 0 &&
2889         (md->nltype == NLTYPE_ANY ||
2890          md->nltype == NLTYPE_ANYCRLF ||
2891          md->nllen == 2))
2892     current_subject++;
2893
2894   }   /* "Bumpalong" loop */
2895
2896 return PCRE_ERROR_NOMATCH;
2897 }
2898
2899 /* End of pcre_dfa_exec.c */