glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2007 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 #ifdef HAVE_CONFIG_H
  48 #include "config.h"
  49 #endif
  50
  51 #define NLBLOCK md             /* Name of the block holding newline data (md->nllen) */
  52 #define PSSTART start_subject  /* Field holding the start of the processed subject */
  53 #define PSEND   end_subject    /* Field holding the end of the processed subject */
  54
  55 #include "pcre_internal.h"
  56
  57
  58 /* Blank padding printed via "%.*s" (width rlevel*2-2) to indent debugging output */
  59
  60 #define SP "                   "
  61
  62
  63
  64 /*************************************************
  65 *      Code parameters and static tables         *
  66 *************************************************/
  67
  68 /* These offsets are added to OP_TYPESTAR and friends to turn them into private
  69 opcodes when the repeated item is \P/\p, \X, \R, \H/\h or \V/\v. A gap of 20
  70 between the blocks should be enough. The resulting opcodes don't have to be less
  71 than 256 because they are never stored, so we push them well clear of the normal opcodes. */
  72
  73 #define OP_PROP_EXTRA       300  /* Repeated item is \P or \p */
  74 #define OP_EXTUNI_EXTRA     320  /* Repeated item is \X */
  75 #define OP_ANYNL_EXTRA      340  /* Repeated item is \R */
  76 #define OP_HSPACE_EXTRA     360  /* Repeated item is \H or \h */
  77 #define OP_VSPACE_EXTRA     380  /* Repeated item is \V or \v */
  78
  79
  80 /* This table identifies those opcodes that are followed immediately by a
  81 character that is to be tested in some way. This makes it possible to centralize
  82 the loading of these characters. Each non-zero entry is the offset from the opcode
  83 to that character. In the case of Type * etc, the "character" is the opcode for \D,
  84 \d, \S, \s, \W, or \w, which will always be a small value. ***NOTE*** If the start of
  85 this table is modified, the two tables that follow must also be modified. */
  86
  87 static uschar coptable[] = {
  88   0,                             /* End                                    */
  89   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  90   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  91   0, 0,                          /* Any, Anybyte                           */
  92   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
  93   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  94   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  95   1,                             /* Char                                   */
  96   1,                             /* Charnc                                 */
  97   1,                             /* not                                    */
  98   /* Positive single-char repeats                                          */
  99   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 100   3, 3, 3,                       /* upto, minupto, exact                   */
 101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
 102   /* Negative single-char repeats - only for chars < 256                   */
 103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 104   3, 3, 3,                       /* NOT upto, minupto, exact               */
 105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
 106   /* Positive type repeats                                                 */
 107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 108   3, 3, 3,                       /* Type upto, minupto, exact              */
 109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 110   /* Character class & ref repeats                                         */
 111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 113   0,                             /* CLASS                                  */
 114   0,                             /* NCLASS                                 */
 115   0,                             /* XCLASS - variable length               */
 116   0,                             /* REF                                    */
 117   0,                             /* RECURSE                                */
 118   0,                             /* CALLOUT                                */
 119   0,                             /* Alt                                    */
 120   0,                             /* Ket                                    */
 121   0,                             /* KetRmax                                */
 122   0,                             /* KetRmin                                */
 123   0,                             /* Assert                                 */
 124   0,                             /* Assert not                             */
 125   0,                             /* Assert behind                          */
 126   0,                             /* Assert behind not                      */
 127   0,                             /* Reverse                                */
 128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 130   0,                             /* CREF                                   */
 131   0,                             /* RREF                                   */
 132   0,                             /* DEF                                    */
 133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
 134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
 135   0, 0                           /* FAIL, ACCEPT                           */
 136 };
 137
 138 /* These 2 tables, indexed by opcode, allow for compact code for testing for \D, \d, \S, \s, \W,
 139 and \w: a character c passes when ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. */
 140
 141 static uschar toptable1[] = {
 142   0, 0, 0, 0, 0, 0,
 143   ctype_digit, ctype_digit,
 144   ctype_space, ctype_space,
 145   ctype_word,  ctype_word,
 146   0                               /* OP_ANY: no ctype bit is selected */
 147 };
 148
 149 static uschar toptable2[] = {
 150   0, 0, 0, 0, 0, 0,
 151   ctype_digit, 0,
 152   ctype_space, 0,
 153   ctype_word,  0,
 154   1                               /* OP_ANY: the XOR makes the test non-zero */
 155 };
 156
 157
 158 /* Structure for holding data about a particular state, which is in effect the
 159 current data for an active path through the match tree. It must consist
 160 entirely of ints because the working vector we are passed, and which we put
 161 these structures in, is a vector of ints. */
 162
 163 typedef struct stateblock {
 164   int offset;                     /* Offset to opcode; negative while the state is held off */
 165   int count;                      /* Count for repeats (characters already matched) */
 166   int ims;                        /* ims flag bits in force for this state */
 167   int data;                       /* Extra data for some states, e.g. characters still to skip */
 168 } stateblock;
 169
 170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  /* Size of one stateblock, in ints */
 171
 172
 173 #ifdef DEBUG
 174 /*************************************************
 175 *             Print character string             *
 176 *************************************************/
 177
 178 /* Character string printing function for debugging. Bytes for which isprint()
 179 is true are written as they are; all others are written as \xhh hex escapes.
 180 Arguments:
 181   p            points to string (only length bytes are read)
 182   length       number of bytes to print
 183   f            where to print
 184
 185 Returns:       nothing
 186 */
 187
 188 static void
 189 pchars(unsigned char *p, int length, FILE *f)
 190 {
 191 int c;                          /* Current byte, held as an int */
 192 while (length-- > 0)
 193   {
 194   if (isprint(c = *(p++)))      /* Fetch the byte and step on */
 195     fprintf(f, "%c", c);
 196   else
 197     fprintf(f, "\\x%02x", c);
 198   }
 199 }
 200 #endif
 201
 202
 203
 204 /*************************************************
 205 *    Execute a Regular Expression - DFA engine   *
 206 *************************************************/
 207
 208 /* This internal function applies a compiled pattern to a subject string,
 209 starting at a given point, using a DFA engine. This function is called from the
 210 external one, possibly multiple times if the pattern is not anchored. The
 211 function calls itself recursively for some kinds of subpattern.
 212
 213 Arguments:
 214   md                the match_data block with fixed information
 215   this_start_code   the opening bracket of this subexpression's code
 216   current_subject   where we currently are in the subject string
 217   start_offset      start offset in the subject string
 218   offsets           vector to contain the matching string offsets
 219   offsetcount       size of same
 220   workspace         vector of workspace
 221   wscount           size of same
 222   ims               the current ims flags
 223   rlevel            function call recursion level
 224   recursing         regex recursive call level
 225
 226 Returns:            > 0 => number of matches whose offset pairs were stored in offsets
 227                     = 0 => matched, but offsets was too small to hold every match
 228                      -1 => failed to match
 229                    < -1 => some kind of unexpected problem
 230
 231 The following macros add a state to one of the two state vectors (one for the current character,
 232 one for the following character), or return PCRE_ERROR_DFA_WSSIZE from the caller when wscount states are already held. */
 233
 234 #define ADD_ACTIVE(x,y) \
 235   if (active_count++ < wscount) \
 236     { \
 237     next_active_state->offset = (x); \
 238     next_active_state->count  = (y); \
 239     next_active_state->ims    = ims; \
 240     next_active_state++; \
 241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 242     } \
 243   else return PCRE_ERROR_DFA_WSSIZE
 244
 245 #define ADD_ACTIVE_DATA(x,y,z) \
 246   if (active_count++ < wscount) \
 247     { \
 248     next_active_state->offset = (x); \
 249     next_active_state->count  = (y); \
 250     next_active_state->ims    = ims; \
 251     next_active_state->data   = (z); \
 252     next_active_state++; \
 253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 254     } \
 255   else return PCRE_ERROR_DFA_WSSIZE
 256
 257 #define ADD_NEW(x,y) \
 258   if (new_count++ < wscount) \
 259     { \
 260     next_new_state->offset = (x); \
 261     next_new_state->count  = (y); \
 262     next_new_state->ims    = ims; \
 263     next_new_state++; \
 264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 265     } \
 266   else return PCRE_ERROR_DFA_WSSIZE
 267
 268 #define ADD_NEW_DATA(x,y,z) \
 269   if (new_count++ < wscount) \
 270     { \
 271     next_new_state->offset = (x); \
 272     next_new_state->count  = (y); \
 273     next_new_state->ims    = ims; \
 274     next_new_state->data   = (z); \
 275     next_new_state++; \
 276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 277     } \
 278   else return PCRE_ERROR_DFA_WSSIZE
 279
 280 /* And now, here is the code */
 281
 282 static int
 283 internal_dfa_exec(
 284   dfa_match_data *md,
 285   const uschar *this_start_code,
 286   const uschar *current_subject,
 287   int start_offset,
 288   int *offsets,
 289   int offsetcount,
 290   int *workspace,
 291   int wscount,
 292   int ims,
 293   int  rlevel,
 294   int  recursing)
 295 {
 296 stateblock *active_states, *new_states, *temp_states;
 297 stateblock *next_active_state, *next_new_state;
 298
 299 const uschar *ctypes, *lcc, *fcc;
 300 const uschar *ptr;
 301 const uschar *end_code, *first_op;
 302
 303 int active_count, new_count, match_count;
 304
 305 /* Some fields in the md block are frequently referenced, so we load them into
 306 independent variables in the hope that this will perform better. */
 307
 308 const uschar *start_subject = md->start_subject;
 309 const uschar *end_subject = md->end_subject;
 310 const uschar *start_code = md->start_code;
 311
 312 #ifdef SUPPORT_UTF8
 313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 314 #else
 315 BOOL utf8 = FALSE;
 316 #endif
 317
 318 rlevel++;
 319 offsetcount &= (-2);
 320
 321 wscount -= 2;
 322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 323           (2 * INTS_PER_STATEBLOCK);
 324
 325 DPRINTF(("\n%.*s---------------------\n"
 326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 328
 329 ctypes = md->tables + ctypes_offset;
 330 lcc = md->tables + lcc_offset;
 331 fcc = md->tables + fcc_offset;
 332
 333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 334
 335 active_states = (stateblock *)(workspace + 2);
 336 next_new_state = new_states = active_states + wscount;
 337 new_count = 0;
 338
 339 first_op = this_start_code + 1 + LINK_SIZE +
 340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 341
 342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 343 the alternative states onto the list, and find out where the end is. This
 344 makes is possible to use this function recursively, when we want to stop at a
 345 matching internal ket rather than at the end.
 346
 347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 348 a backward assertion. In that case, we have to find out the maximum amount to
 349 move back, and set up each alternative appropriately. */
 350
 351 if (*first_op == OP_REVERSE)
 352   {
 353   int max_back = 0;
 354   int gone_back;
 355
 356   end_code = this_start_code;
 357   do
 358     {
 359     int back = GET(end_code, 2+LINK_SIZE);
 360     if (back > max_back) max_back = back;
 361     end_code += GET(end_code, 1);
 362     }
 363   while (*end_code == OP_ALT);
 364
 365   /* If we can't go back the amount required for the longest lookbehind
 366   pattern, go back as far as we can; some alternatives may still be viable. */
 367
 368 #ifdef SUPPORT_UTF8
 369   /* In character mode we have to step back character by character */
 370
 371   if (utf8)
 372     {
 373     for (gone_back = 0; gone_back < max_back; gone_back++)
 374       {
 375       if (current_subject <= start_subject) break;
 376       current_subject--;
 377       while (current_subject > start_subject &&
 378              (*current_subject & 0xc0) == 0x80)
 379         current_subject--;
 380       }
 381     }
 382   else
 383 #endif
 384
 385   /* In byte-mode we can do this quickly. */
 386
 387     {
 388     gone_back = (current_subject - max_back < start_subject)?
 389       current_subject - start_subject : max_back;
 390     current_subject -= gone_back;
 391     }
 392
 393   /* Now we can process the individual branches. */
 394
 395   end_code = this_start_code;
 396   do
 397     {
 398     int back = GET(end_code, 2+LINK_SIZE);
 399     if (back <= gone_back)
 400       {
 401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 403       }
 404     end_code += GET(end_code, 1);
 405     }
 406   while (*end_code == OP_ALT);
 407  }
 408
 409 /* This is the code for a "normal" subpattern (not a backward assertion). The
 410 start of a whole pattern is always one of these. If we are at the top level,
 411 we may be asked to restart matching from the same point that we reached for a
 412 previous partial match. We still have to scan through the top-level branches to
 413 find the end state. */
 414
 415 else
 416   {
 417   end_code = this_start_code;
 418
 419   /* Restarting */
 420
 421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 422     {
 423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 424     new_count = workspace[1];
 425     if (!workspace[0])
 426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 427     }
 428
 429   /* Not restarting */
 430
 431   else
 432     {
 433     int length = 1 + LINK_SIZE +
 434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 435     do
 436       {
 437       ADD_NEW(end_code - start_code + length, 0);
 438       end_code += GET(end_code, 1);
 439       length = 1 + LINK_SIZE;
 440       }
 441     while (*end_code == OP_ALT);
 442     }
 443   }
 444
 445 workspace[0] = 0;    /* Bit indicating which vector is current */
 446
 447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 448
 449 /* Loop for scanning the subject */
 450
 451 ptr = current_subject;
 452 for (;;)
 453   {
 454   int i, j;
 455   int clen, dlen;
 456   unsigned int c, d;
 457
 458   /* Make the new state list into the active state list and empty the
 459   new state list. */
 460
 461   temp_states = active_states;
 462   active_states = new_states;
 463   new_states = temp_states;
 464   active_count = new_count;
 465   new_count = 0;
 466
 467   workspace[0] ^= 1;              /* Remember for the restarting feature */
 468   workspace[1] = active_count;
 469
 470 #ifdef DEBUG
 471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 473   printf("\"\n");
 474
 475   printf("%.*sActive states: ", rlevel*2-2, SP);
 476   for (i = 0; i < active_count; i++)
 477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 478   printf("\n");
 479 #endif
 480
 481   /* Set the pointers for adding new states */
 482
 483   next_active_state = active_states + active_count;
 484   next_new_state = new_states;
 485
 486   /* Load the current character from the subject outside the loop, as many
 487   different states may want to look at it, and we assume that at least one
 488   will. */
 489
 490   if (ptr < end_subject)
 491     {
 492     clen = 1;        /* Number of bytes in the character */
 493 #ifdef SUPPORT_UTF8
 494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 495 #endif  /* SUPPORT_UTF8 */
 496     c = *ptr;
 497     }
 498   else
 499     {
 500     clen = 0;        /* This indicates the end of the subject */
 501     c = NOTACHAR;    /* This value should never actually be used */
 502     }
 503
 504   /* Scan up the active states and act on each one. The result of an action
 505   may be to add more states to the currently active list (e.g. on hitting a
 506   parenthesis) or it may be to put states on the new list, for considering
 507   when we move the character pointer on. */
 508
 509   for (i = 0; i < active_count; i++)
 510     {
 511     stateblock *current_state = active_states + i;
 512     const uschar *code;
 513     int state_offset = current_state->offset;
 514     int count, codevalue;
 515 #ifdef SUPPORT_UCP
 516     int chartype, script;
 517 #endif
 518
 519 #ifdef DEBUG
 520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 521     if (clen == 0) printf("EOL\n");
 522       else if (c > 32 && c < 127) printf("'%c'\n", c);
 523         else printf("0x%02x\n", c);
 524 #endif
 525
 526     /* This variable is referred to implicity in the ADD_xxx macros. */
 527
 528     ims = current_state->ims;
 529
 530     /* A negative offset is a special case meaning "hold off going to this
 531     (negated) state until the number of characters in the data field have
 532     been skipped". */
 533
 534     if (state_offset < 0)
 535       {
 536       if (current_state->data > 0)
 537         {
 538         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 539         ADD_NEW_DATA(state_offset, current_state->count,
 540           current_state->data - 1);
 541         continue;
 542         }
 543       else
 544         {
 545         current_state->offset = state_offset = -state_offset;
 546         }
 547       }
 548
 549     /* Check for a duplicate state with the same count, and skip if found. */
 550
 551     for (j = 0; j < i; j++)
 552       {
 553       if (active_states[j].offset == state_offset &&
 554           active_states[j].count == current_state->count)
 555         {
 556         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 557         goto NEXT_ACTIVE_STATE;
 558         }
 559       }
 560
 561     /* The state offset is the offset to the opcode */
 562
 563     code = start_code + state_offset;
 564     codevalue = *code;
 565
 566     /* If this opcode is followed by an inline character, load it. It is
 567     tempting to test for the presence of a subject character here, but that
 568     is wrong, because sometimes zero repetitions of the subject are
 569     permitted.
 570
 571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 572     argument that is not a data character - but is always one byte long. We
 573     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
 574     this case. To keep the other cases fast, convert these ones to new opcodes.
 575     */
 576
 577     if (coptable[codevalue] > 0)
 578       {
 579       dlen = 1;
 580 #ifdef SUPPORT_UTF8
 581       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 582 #endif  /* SUPPORT_UTF8 */
 583       d = code[coptable[codevalue]];
 584       if (codevalue >= OP_TYPESTAR)
 585         {
 586         switch(d)
 587           {
 588           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 589           case OP_NOTPROP:
 590           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 591           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 592           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 593           case OP_NOT_HSPACE:
 594           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 595           case OP_NOT_VSPACE:
 596           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 597           default: break;
 598           }
 599         }
 600       }
 601     else
 602       {
 603       dlen = 0;         /* Not strictly necessary, but compilers moan */
 604       d = NOTACHAR;     /* if these variables are not set. */
 605       }
 606
 607
 608     /* Now process the individual opcodes */
 609
 610     switch (codevalue)
 611       {
 612
 613 /* ========================================================================== */
 614       /* Reached a closing bracket. If not at the end of the pattern, carry
 615       on with the next opcode. Otherwise, unless we have an empty string and
 616       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
 617       matches so we always have the longest first. */
 618
 619       case OP_KET:
 620       case OP_KETRMIN:
 621       case OP_KETRMAX:
 622       if (code != end_code)
 623         {
 624         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 625         if (codevalue != OP_KET)
 626           {
 627           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 628           }
 629         }
 630       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
 631         {
 632         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 633           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 634             match_count = 0;
 635         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 636         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 637         if (offsetcount >= 2)
 638           {
 639           offsets[0] = current_subject - start_subject;
 640           offsets[1] = ptr - start_subject;
 641           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 642             offsets[1] - offsets[0], current_subject));
 643           }
 644         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 645           {
 646           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 647             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 648             match_count, rlevel*2-2, SP));
 649           return match_count;
 650           }
 651         }
 652       break;
 653
 654 /* ========================================================================== */
 655       /* These opcodes add to the current list of states without looking
 656       at the current character. */
 657
 658       /*-----------------------------------------------------------------*/
 659       case OP_ALT:
 660       do { code += GET(code, 1); } while (*code == OP_ALT);
 661       ADD_ACTIVE(code - start_code, 0);
 662       break;
 663
 664       /*-----------------------------------------------------------------*/
 665       case OP_BRA:
 666       case OP_SBRA:
 667       do
 668         {
 669         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 670         code += GET(code, 1);
 671         }
 672       while (*code == OP_ALT);
 673       break;
 674
 675       /*-----------------------------------------------------------------*/
 676       case OP_CBRA:
 677       case OP_SCBRA:
 678       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 679       code += GET(code, 1);
 680       while (*code == OP_ALT)
 681         {
 682         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 683         code += GET(code, 1);
 684         }
 685       break;
 686
 687       /*-----------------------------------------------------------------*/
 688       case OP_BRAZERO:
 689       case OP_BRAMINZERO:
 690       ADD_ACTIVE(state_offset + 1, 0);
 691       code += 1 + GET(code, 2);
 692       while (*code == OP_ALT) code += GET(code, 1);
 693       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 694       break;
 695
 696       /*-----------------------------------------------------------------*/
 697       case OP_CIRC:
 698       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 699           ((ims & PCRE_MULTILINE) != 0 &&
 700             ptr != end_subject &&
 701             WAS_NEWLINE(ptr)))
 702         { ADD_ACTIVE(state_offset + 1, 0); }
 703       break;
 704
 705       /*-----------------------------------------------------------------*/
 706       case OP_EOD:
 707       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 708       break;
 709
 710       /*-----------------------------------------------------------------*/
 711       case OP_OPT:
 712       ims = code[1];
 713       ADD_ACTIVE(state_offset + 2, 0);
 714       break;
 715
 716       /*-----------------------------------------------------------------*/
 717       case OP_SOD:
 718       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 719       break;
 720
 721       /*-----------------------------------------------------------------*/
 722       case OP_SOM:
 723       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 724       break;
 725
 726
 727 /* ========================================================================== */
 728       /* These opcodes inspect the next subject character, and sometimes
 729       the previous one as well, but do not have an argument. The variable
 730       clen contains the length of the current character and is zero if we are
 731       at the end of the subject. */
 732
 733       /*-----------------------------------------------------------------*/
 734       case OP_ANY:
 735       if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
 736         { ADD_NEW(state_offset + 1, 0); }
 737       break;
 738
 739       /*-----------------------------------------------------------------*/
 740       case OP_EODN:
 741       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 742         { ADD_ACTIVE(state_offset + 1, 0); }
 743       break;
 744
 745       /*-----------------------------------------------------------------*/
 746       case OP_DOLL:
 747       if ((md->moptions & PCRE_NOTEOL) == 0)
 748         {
 749         if (clen == 0 ||
 750             (IS_NEWLINE(ptr) &&
 751                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 752             ))
 753           { ADD_ACTIVE(state_offset + 1, 0); }
 754         }
 755       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 756         { ADD_ACTIVE(state_offset + 1, 0); }
 757       break;
 758
 759       /*-----------------------------------------------------------------*/
 760
 761       case OP_DIGIT:
 762       case OP_WHITESPACE:
 763       case OP_WORDCHAR:
 764       if (clen > 0 && c < 256 &&
 765             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 766         { ADD_NEW(state_offset + 1, 0); }
 767       break;
 768
 769       /*-----------------------------------------------------------------*/
 770       case OP_NOT_DIGIT:
 771       case OP_NOT_WHITESPACE:
 772       case OP_NOT_WORDCHAR:
 773       if (clen > 0 && (c >= 256 ||
 774             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 775         { ADD_NEW(state_offset + 1, 0); }
 776       break;
 777
 778       /*-----------------------------------------------------------------*/
 779       case OP_WORD_BOUNDARY:
 780       case OP_NOT_WORD_BOUNDARY:
 781         {
 782         int left_word, right_word;
 783
 784         if (ptr > start_subject)
 785           {
 786           const uschar *temp = ptr - 1;
 787 #ifdef SUPPORT_UTF8
 788           if (utf8) BACKCHAR(temp);
 789 #endif
 790           GETCHARTEST(d, temp);
 791           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 792           }
 793         else left_word = 0;
 794
 795         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 796           else right_word = 0;
 797
 798         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 799           { ADD_ACTIVE(state_offset + 1, 0); }
 800         }
 801       break;
 802
 803
 804       /*-----------------------------------------------------------------*/
 805       /* Check the next character by Unicode property. We will get here only
 806       if the support is in the binary; otherwise a compile-time error occurs.
 807       */
 808
 809 #ifdef SUPPORT_UCP
 810       case OP_PROP:
 811       case OP_NOTPROP:
 812       if (clen > 0)
 813         {
 814         BOOL OK;
 815         int category = _pcre_ucp_findprop(c, &chartype, &script);
 816         switch(code[1])
 817           {
 818           case PT_ANY:
 819           OK = TRUE;
 820           break;
 821
 822           case PT_LAMP:
 823           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 824           break;
 825
 826           case PT_GC:
 827           OK = category == code[2];
 828           break;
 829
 830           case PT_PC:
 831           OK = chartype == code[2];
 832           break;
 833
 834           case PT_SC:
 835           OK = script == code[2];
 836           break;
 837
 838           /* Should never occur, but keep compilers from grumbling. */
 839
 840           default:
 841           OK = codevalue != OP_PROP;
 842           break;
 843           }
 844
 845         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 846         }
 847       break;
 848 #endif
 849
 850
 851
 852 /* ========================================================================== */
 853       /* These opcodes likewise inspect the subject character, but have an
 854       argument that is not a data character. It is one of these opcodes:
 855       OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 856       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 857
 858       case OP_TYPEPLUS:
 859       case OP_TYPEMINPLUS:
 860       case OP_TYPEPOSPLUS:
 861       count = current_state->count;  /* Already matched */
 862       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 863       if (clen > 0)
 864         {
 865         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 866             (c < 256 &&
 867               (d != OP_ANY ||
 868                (ims & PCRE_DOTALL) != 0 ||
 869                !IS_NEWLINE(ptr)
 870               ) &&
 871               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 872           {
 873           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 874             {
 875             active_count--;            /* Remove non-match possibility */
 876             next_active_state--;
 877             }
 878           count++;
 879           ADD_NEW(state_offset, count);
 880           }
 881         }
 882       break;
 883
 884       /*-----------------------------------------------------------------*/
 885       case OP_TYPEQUERY:
 886       case OP_TYPEMINQUERY:
 887       case OP_TYPEPOSQUERY:
 888       ADD_ACTIVE(state_offset + 2, 0);
 889       if (clen > 0)
 890         {
 891         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 892             (c < 256 &&
 893               (d != OP_ANY ||
 894                (ims & PCRE_DOTALL) != 0 ||
 895                !IS_NEWLINE(ptr)
 896               ) &&
 897               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 898           {
 899           if (codevalue == OP_TYPEPOSQUERY)
 900             {
 901             active_count--;            /* Remove non-match possibility */
 902             next_active_state--;
 903             }
 904           ADD_NEW(state_offset + 2, 0);
 905           }
 906         }
 907       break;
 908
 909       /*-----------------------------------------------------------------*/
 910       case OP_TYPESTAR:
 911       case OP_TYPEMINSTAR:
 912       case OP_TYPEPOSSTAR:
 913       ADD_ACTIVE(state_offset + 2, 0);
 914       if (clen > 0)
 915         {
 916         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 917             (c < 256 &&
 918               (d != OP_ANY ||
 919                (ims & PCRE_DOTALL) != 0 ||
 920                !IS_NEWLINE(ptr)
 921               ) &&
 922               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 923           {
 924           if (codevalue == OP_TYPEPOSSTAR)
 925             {
 926             active_count--;            /* Remove non-match possibility */
 927             next_active_state--;
 928             }
 929           ADD_NEW(state_offset, 0);
 930           }
 931         }
 932       break;
 933
 934       /*-----------------------------------------------------------------*/
 935       case OP_TYPEEXACT:
 936       count = current_state->count;  /* Number already matched */
 937       if (clen > 0)
 938         {
 939         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 940             (c < 256 &&
 941               (d != OP_ANY ||
 942                (ims & PCRE_DOTALL) != 0 ||
 943                !IS_NEWLINE(ptr)
 944               ) &&
 945               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 946           {
 947           if (++count >= GET2(code, 1))
 948             { ADD_NEW(state_offset + 4, 0); }
 949           else
 950             { ADD_NEW(state_offset, count); }
 951           }
 952         }
 953       break;
 954
 955       /*-----------------------------------------------------------------*/
 956       case OP_TYPEUPTO:
 957       case OP_TYPEMINUPTO:
 958       case OP_TYPEPOSUPTO:
 959       ADD_ACTIVE(state_offset + 4, 0);
 960       count = current_state->count;  /* Number already matched */
 961       if (clen > 0)
 962         {
 963         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 964             (c < 256 &&
 965               (d != OP_ANY ||
 966                (ims & PCRE_DOTALL) != 0 ||
 967                !IS_NEWLINE(ptr)
 968               ) &&
 969               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 970           {
 971           if (codevalue == OP_TYPEPOSUPTO)
 972             {
 973             active_count--;           /* Remove non-match possibility */
 974             next_active_state--;
 975             }
 976           if (++count >= GET2(code, 1))
 977             { ADD_NEW(state_offset + 4, 0); }
 978           else
 979             { ADD_NEW(state_offset, count); }
 980           }
 981         }
 982       break;
 983
 984 /* ========================================================================== */
 985       /* These are virtual opcodes that are used when something like
 986       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or an
 987       OP_HSPACE/OP_VSPACE-style type as its argument. This keeps the code
 988       above fast for the other cases. The argument is in the d variable. */
 989
 990 #ifdef SUPPORT_UCP
 991       case OP_PROP_EXTRA + OP_TYPEPLUS:
 992       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 993       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 994       count = current_state->count;           /* Already matched */
 995       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 996       if (clen > 0)
 997         {
 998         BOOL OK;
 999         int category = _pcre_ucp_findprop(c, &chartype, &script);
1000         switch(code[2])
1001           {
1002           case PT_ANY:
1003           OK = TRUE;
1004           break;
1005
1006           case PT_LAMP:
1007           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1008           break;
1009
1010           case PT_GC:
1011           OK = category == code[3];
1012           break;
1013
1014           case PT_PC:
1015           OK = chartype == code[3];
1016           break;
1017
1018           case PT_SC:
1019           OK = script == code[3];
1020           break;
1021
1022           /* Should never occur, but keep compilers from grumbling. */
1023
1024           default:
1025           OK = codevalue != OP_PROP;
1026           break;
1027           }
1028
1029         if (OK == (d == OP_PROP))
1030           {
1031           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1032             {
1033             active_count--;           /* Remove non-match possibility */
1034             next_active_state--;
1035             }
1036           count++;
1037           ADD_NEW(state_offset, count);
1038           }
1039         }
1040       break;
1041
1042       /*-----------------------------------------------------------------*/
1043       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1044       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1045       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1046       count = current_state->count;  /* Already matched */
1047       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1048       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1049         {
1050         const uschar *nptr = ptr + clen;
1051         int ncount = 0;
1052         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1053           {
1054           active_count--;           /* Remove non-match possibility */
1055           next_active_state--;
1056           }
1057         while (nptr < end_subject)
1058           {
1059           int nd;
1060           int ndlen = 1;
1061           GETCHARLEN(nd, nptr, ndlen);
1062           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1063           ncount++;
1064           nptr += ndlen;
1065           }
1066         count++;
1067         ADD_NEW_DATA(-state_offset, count, ncount);
1068         }
1069       break;
1070 #endif
1071
1072       /*-----------------------------------------------------------------*/
1073       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1074       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1075       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1076       count = current_state->count;  /* Already matched */
1077       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1078       if (clen > 0)
1079         {
1080         int ncount = 0;
1081         switch (c)
1082           {
1083           case 0x000b:
1084           case 0x000c:
1085           case 0x0085:
1086           case 0x2028:
1087           case 0x2029:
1088           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1089           goto ANYNL01;
1090
1091           case 0x000d:
1092           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1093           /* Fall through */
1094
1095           ANYNL01:
1096           case 0x000a:
1097           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1098             {
1099             active_count--;           /* Remove non-match possibility */
1100             next_active_state--;
1101             }
1102           count++;
1103           ADD_NEW_DATA(-state_offset, count, ncount);
1104           break;
1105
1106           default:
1107           break;
1108           }
1109         }
1110       break;
1111
1112       /*-----------------------------------------------------------------*/
1113       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1114       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1115       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1116       count = current_state->count;  /* Already matched */
1117       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1118       if (clen > 0)
1119         {
1120         BOOL OK;
1121         switch (c)
1122           {
1123           case 0x000a:
1124           case 0x000b:
1125           case 0x000c:
1126           case 0x000d:
1127           case 0x0085:
1128           case 0x2028:
1129           case 0x2029:
1130           OK = TRUE;
1131           break;
1132
1133           default:
1134           OK = FALSE;
1135           break;
1136           }
1137
1138         if (OK == (d == OP_VSPACE))
1139           {
1140           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1141             {
1142             active_count--;           /* Remove non-match possibility */
1143             next_active_state--;
1144             }
1145           count++;
1146           ADD_NEW_DATA(-state_offset, count, 0);
1147           }
1148         }
1149       break;
1150
1151       /*-----------------------------------------------------------------*/
1152       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1153       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1154       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1155       count = current_state->count;  /* Already matched */
1156       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1157       if (clen > 0)
1158         {
1159         BOOL OK;
1160         switch (c)
1161           {
1162           case 0x09:      /* HT */
1163           case 0x20:      /* SPACE */
1164           case 0xa0:      /* NBSP */
1165           case 0x1680:    /* OGHAM SPACE MARK */
1166           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1167           case 0x2000:    /* EN QUAD */
1168           case 0x2001:    /* EM QUAD */
1169           case 0x2002:    /* EN SPACE */
1170           case 0x2003:    /* EM SPACE */
1171           case 0x2004:    /* THREE-PER-EM SPACE */
1172           case 0x2005:    /* FOUR-PER-EM SPACE */
1173           case 0x2006:    /* SIX-PER-EM SPACE */
1174           case 0x2007:    /* FIGURE SPACE */
1175           case 0x2008:    /* PUNCTUATION SPACE */
1176           case 0x2009:    /* THIN SPACE */
1177           case 0x200A:    /* HAIR SPACE */
1178           case 0x202f:    /* NARROW NO-BREAK SPACE */
1179           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1180           case 0x3000:    /* IDEOGRAPHIC SPACE */
1181           OK = TRUE;
1182           break;
1183
1184           default:
1185           OK = FALSE;
1186           break;
1187           }
1188
1189         if (OK == (d == OP_HSPACE))
1190           {
1191           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1192             {
1193             active_count--;           /* Remove non-match possibility */
1194             next_active_state--;
1195             }
1196           count++;
1197           ADD_NEW_DATA(-state_offset, count, 0);
1198           }
1199         }
1200       break;
1201
1202       /*-----------------------------------------------------------------*/
1203 #ifdef SUPPORT_UCP
1204       case OP_PROP_EXTRA + OP_TYPEQUERY:
1205       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1206       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1207       count = 4;
1208       goto QS1;
1209
1210       case OP_PROP_EXTRA + OP_TYPESTAR:
1211       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1212       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1213       count = 0;
1214
1215       QS1:
1216
1217       ADD_ACTIVE(state_offset + 4, 0);
1218       if (clen > 0)
1219         {
1220         BOOL OK;
1221         int category = _pcre_ucp_findprop(c, &chartype, &script);
1222         switch(code[2])
1223           {
1224           case PT_ANY:
1225           OK = TRUE;
1226           break;
1227
1228           case PT_LAMP:
1229           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1230           break;
1231
1232           case PT_GC:
1233           OK = category == code[3];
1234           break;
1235
1236           case PT_PC:
1237           OK = chartype == code[3];
1238           break;
1239
1240           case PT_SC:
1241           OK = script == code[3];
1242           break;
1243
1244           /* Should never occur, but keep compilers from grumbling. */
1245
1246           default:
1247           OK = codevalue != OP_PROP;
1248           break;
1249           }
1250
1251         if (OK == (d == OP_PROP))
1252           {
1253           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1254               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1255             {
1256             active_count--;           /* Remove non-match possibility */
1257             next_active_state--;
1258             }
1259           ADD_NEW(state_offset + count, 0);
1260           }
1261         }
1262       break;
1263
1264       /*-----------------------------------------------------------------*/
1265       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1266       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1267       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1268       count = 2;
1269       goto QS2;
1270
1271       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1272       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1273       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1274       count = 0;
1275
1276       QS2:
1277
1278       ADD_ACTIVE(state_offset + 2, 0);
1279       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1280         {
1281         const uschar *nptr = ptr + clen;
1282         int ncount = 0;
1283         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1284             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1285           {
1286           active_count--;           /* Remove non-match possibility */
1287           next_active_state--;
1288           }
1289         while (nptr < end_subject)
1290           {
1291           int nd;
1292           int ndlen = 1;
1293           GETCHARLEN(nd, nptr, ndlen);
1294           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1295           ncount++;
1296           nptr += ndlen;
1297           }
1298         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1299         }
1300       break;
1301 #endif
1302
1303       /*-----------------------------------------------------------------*/
1304       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1305       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1306       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1307       count = 2;
1308       goto QS3;
1309
1310       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1311       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1312       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1313       count = 0;
1314
1315       QS3:
1316       ADD_ACTIVE(state_offset + 2, 0);
1317       if (clen > 0)
1318         {
1319         int ncount = 0;
1320         switch (c)
1321           {
1322           case 0x000b:
1323           case 0x000c:
1324           case 0x0085:
1325           case 0x2028:
1326           case 0x2029:
1327           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1328           goto ANYNL02;
1329
1330           case 0x000d:
1331           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332           /* Fall through */
1333
1334           ANYNL02:
1335           case 0x000a:
1336           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1337               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1338             {
1339             active_count--;           /* Remove non-match possibility */
1340             next_active_state--;
1341             }
1342           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1343           break;
1344
1345           default:
1346           break;
1347           }
1348         }
1349       break;
1350
1351       /*-----------------------------------------------------------------*/
1352       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1353       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1354       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1355       count = 2;
1356       goto QS4;
1357
1358       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1359       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1360       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1361       count = 0;
1362
1363       QS4:
1364       ADD_ACTIVE(state_offset + 2, 0);
1365       if (clen > 0)
1366         {
1367         BOOL OK;
1368         switch (c)
1369           {
1370           case 0x000a:
1371           case 0x000b:
1372           case 0x000c:
1373           case 0x000d:
1374           case 0x0085:
1375           case 0x2028:
1376           case 0x2029:
1377           OK = TRUE;
1378           break;
1379
1380           default:
1381           OK = FALSE;
1382           break;
1383           }
1384         if (OK == (d == OP_VSPACE))
1385           {
1386           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1387               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1388             {
1389             active_count--;           /* Remove non-match possibility */
1390             next_active_state--;
1391             }
1392           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1393           }
1394         }
1395       break;
1396
1397       /*-----------------------------------------------------------------*/
1398       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1399       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1400       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1401       count = 2;
1402       goto QS5;
1403
1404       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1405       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1406       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1407       count = 0;
1408
1409       QS5:
1410       ADD_ACTIVE(state_offset + 2, 0);
1411       if (clen > 0)
1412         {
1413         BOOL OK;
1414         switch (c)
1415           {
1416           case 0x09:      /* HT */
1417           case 0x20:      /* SPACE */
1418           case 0xa0:      /* NBSP */
1419           case 0x1680:    /* OGHAM SPACE MARK */
1420           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1421           case 0x2000:    /* EN QUAD */
1422           case 0x2001:    /* EM QUAD */
1423           case 0x2002:    /* EN SPACE */
1424           case 0x2003:    /* EM SPACE */
1425           case 0x2004:    /* THREE-PER-EM SPACE */
1426           case 0x2005:    /* FOUR-PER-EM SPACE */
1427           case 0x2006:    /* SIX-PER-EM SPACE */
1428           case 0x2007:    /* FIGURE SPACE */
1429           case 0x2008:    /* PUNCTUATION SPACE */
1430           case 0x2009:    /* THIN SPACE */
1431           case 0x200A:    /* HAIR SPACE */
1432           case 0x202f:    /* NARROW NO-BREAK SPACE */
1433           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1434           case 0x3000:    /* IDEOGRAPHIC SPACE */
1435           OK = TRUE;
1436           break;
1437
1438           default:
1439           OK = FALSE;
1440           break;
1441           }
1442
1443         if (OK == (d == OP_HSPACE))
1444           {
1445           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1446               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1447             {
1448             active_count--;           /* Remove non-match possibility */
1449             next_active_state--;
1450             }
1451           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1452           }
1453         }
1454       break;
1455
1456       /*-----------------------------------------------------------------*/
1457 #ifdef SUPPORT_UCP
1458       case OP_PROP_EXTRA + OP_TYPEEXACT:
1459       case OP_PROP_EXTRA + OP_TYPEUPTO:
1460       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1461       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1462       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1463         { ADD_ACTIVE(state_offset + 6, 0); }
1464       count = current_state->count;  /* Number already matched */
1465       if (clen > 0)
1466         {
1467         BOOL OK;
1468         int category = _pcre_ucp_findprop(c, &chartype, &script);
1469         switch(code[4])
1470           {
1471           case PT_ANY:
1472           OK = TRUE;
1473           break;
1474
1475           case PT_LAMP:
1476           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1477           break;
1478
1479           case PT_GC:
1480           OK = category == code[5];
1481           break;
1482
1483           case PT_PC:
1484           OK = chartype == code[5];
1485           break;
1486
1487           case PT_SC:
1488           OK = script == code[5];
1489           break;
1490
1491           /* Should never occur, but keep compilers from grumbling. */
1492
1493           default:
1494           OK = codevalue != OP_PROP;
1495           break;
1496           }
1497
1498         if (OK == (d == OP_PROP))
1499           {
1500           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1501             {
1502             active_count--;           /* Remove non-match possibility */
1503             next_active_state--;
1504             }
1505           if (++count >= GET2(code, 1))
1506             { ADD_NEW(state_offset + 6, 0); }
1507           else
1508             { ADD_NEW(state_offset, count); }
1509           }
1510         }
1511       break;
1512
1513       /*-----------------------------------------------------------------*/
1514       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1515       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1516       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1517       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1518       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1519         { ADD_ACTIVE(state_offset + 4, 0); }
1520       count = current_state->count;  /* Number already matched */
1521       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1522         {
1523         const uschar *nptr = ptr + clen;
1524         int ncount = 0;
1525         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1526           {
1527           active_count--;           /* Remove non-match possibility */
1528           next_active_state--;
1529           }
1530         while (nptr < end_subject)
1531           {
1532           int nd;
1533           int ndlen = 1;
1534           GETCHARLEN(nd, nptr, ndlen);
1535           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1536           ncount++;
1537           nptr += ndlen;
1538           }
1539         if (++count >= GET2(code, 1))
1540           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1541         else
1542           { ADD_NEW_DATA(-state_offset, count, ncount); }
1543         }
1544       break;
1545 #endif
1546
1547       /*-----------------------------------------------------------------*/
1548       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1549       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1550       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1551       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1552       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1553         { ADD_ACTIVE(state_offset + 4, 0); }
1554       count = current_state->count;  /* Number already matched */
1555       if (clen > 0)
1556         {
1557         int ncount = 0;
1558         switch (c)
1559           {
1560           case 0x000b:
1561           case 0x000c:
1562           case 0x0085:
1563           case 0x2028:
1564           case 0x2029:
1565           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1566           goto ANYNL03;
1567
1568           case 0x000d:
1569           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1570           /* Fall through */
1571
1572           ANYNL03:
1573           case 0x000a:
1574           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1575             {
1576             active_count--;           /* Remove non-match possibility */
1577             next_active_state--;
1578             }
1579           if (++count >= GET2(code, 1))
1580             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1581           else
1582             { ADD_NEW_DATA(-state_offset, count, ncount); }
1583           break;
1584
1585           default:
1586           break;
1587           }
1588         }
1589       break;
1590
1591       /*-----------------------------------------------------------------*/
1592       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1593       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1594       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1595       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1596       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1597         { ADD_ACTIVE(state_offset + 4, 0); }
1598       count = current_state->count;  /* Number already matched */
1599       if (clen > 0)
1600         {
1601         BOOL OK;
1602         switch (c)
1603           {
1604           case 0x000a:
1605           case 0x000b:
1606           case 0x000c:
1607           case 0x000d:
1608           case 0x0085:
1609           case 0x2028:
1610           case 0x2029:
1611           OK = TRUE;
1612           break;
1613
1614           default:
1615           OK = FALSE;
1616           }
1617
1618         if (OK == (d == OP_VSPACE))
1619           {
1620           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1621             {
1622             active_count--;           /* Remove non-match possibility */
1623             next_active_state--;
1624             }
1625           if (++count >= GET2(code, 1))
1626             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1627           else
1628             { ADD_NEW_DATA(-state_offset, count, 0); }
1629           }
1630         }
1631       break;
1632
1633       /*-----------------------------------------------------------------*/
1634       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1635       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1636       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1637       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1638       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1639         { ADD_ACTIVE(state_offset + 4, 0); }
1640       count = current_state->count;  /* Number already matched */
1641       if (clen > 0)
1642         {
1643         BOOL OK;
1644         switch (c)
1645           {
1646           case 0x09:      /* HT */
1647           case 0x20:      /* SPACE */
1648           case 0xa0:      /* NBSP */
1649           case 0x1680:    /* OGHAM SPACE MARK */
1650           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1651           case 0x2000:    /* EN QUAD */
1652           case 0x2001:    /* EM QUAD */
1653           case 0x2002:    /* EN SPACE */
1654           case 0x2003:    /* EM SPACE */
1655           case 0x2004:    /* THREE-PER-EM SPACE */
1656           case 0x2005:    /* FOUR-PER-EM SPACE */
1657           case 0x2006:    /* SIX-PER-EM SPACE */
1658           case 0x2007:    /* FIGURE SPACE */
1659           case 0x2008:    /* PUNCTUATION SPACE */
1660           case 0x2009:    /* THIN SPACE */
1661           case 0x200A:    /* HAIR SPACE */
1662           case 0x202f:    /* NARROW NO-BREAK SPACE */
1663           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1664           case 0x3000:    /* IDEOGRAPHIC SPACE */
1665           OK = TRUE;
1666           break;
1667
1668           default:
1669           OK = FALSE;
1670           break;
1671           }
1672
1673         if (OK == (d == OP_HSPACE))
1674           {
1675           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1676             {
1677             active_count--;           /* Remove non-match possibility */
1678             next_active_state--;
1679             }
1680           if (++count >= GET2(code, 1))
1681             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1682           else
1683             { ADD_NEW_DATA(-state_offset, count, 0); }
1684           }
1685         }
1686       break;
1687
1688 /* ========================================================================== */
1689       /* These opcodes are followed by a character that is usually compared
1690       to the current subject character; it is loaded into d. We still get
1691       here even if there is no subject character, because in some cases zero
1692       repetitions are permitted. */
1693
1694       /*-----------------------------------------------------------------*/
1695       case OP_CHAR:
1696       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1697       break;
1698
1699       /*-----------------------------------------------------------------*/
1700       case OP_CHARNC:
1701       if (clen == 0) break;
1702
1703 #ifdef SUPPORT_UTF8
1704       if (utf8)
1705         {
1706         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1707           {
1708           unsigned int othercase;
1709           if (c < 128) othercase = fcc[c]; else
1710
1711           /* If we have Unicode property support, we can use it to test the
1712           other case of the character. */
1713
1714 #ifdef SUPPORT_UCP
1715           othercase = _pcre_ucp_othercase(c);
1716 #else
1717           othercase = NOTACHAR;
1718 #endif
1719
1720           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1721           }
1722         }
1723       else
1724 #endif  /* SUPPORT_UTF8 */
1725
1726       /* Non-UTF-8 mode */
1727         {
1728         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1729         }
1730       break;
1731
1732
1733 #ifdef SUPPORT_UCP
1734       /*-----------------------------------------------------------------*/
1735       /* This is a tricky one because it can match more than one character.
1736       Find out how many characters to skip, and then set up a negative state
1737       to wait for them to pass before continuing. */
1738
1739       case OP_EXTUNI:
1740       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1741         {
1742         const uschar *nptr = ptr + clen;
1743         int ncount = 0;
1744         while (nptr < end_subject)
1745           {
1746           int nclen = 1;
1747           GETCHARLEN(c, nptr, nclen);
1748           if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1749           ncount++;
1750           nptr += nclen;
1751           }
1752         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1753         }
1754       break;
1755 #endif
1756
1757       /*-----------------------------------------------------------------*/
1758       /* This is a tricky one, like EXTUNI, because it too can match more than
1759       one character (when CR is followed by LF). In this case, set up a
1760       negative state to wait for one character to pass before continuing. */
1761
1762       case OP_ANYNL:
1763       if (clen > 0) switch(c)
1764         {
1765         case 0x000b:
1766         case 0x000c:
1767         case 0x0085:
1768         case 0x2028:
1769         case 0x2029:
1770         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1771
1772         case 0x000a:
1773         ADD_NEW(state_offset + 1, 0);
1774         break;
1775
1776         case 0x000d:
1777         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1778           {
1779           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1780           }
1781         else
1782           {
1783           ADD_NEW(state_offset + 1, 0);
1784           }
1785         break;
1786         }
1787       break;
1788
1789       /*-----------------------------------------------------------------*/
1790       case OP_NOT_VSPACE:
1791       if (clen > 0) switch(c)
1792         {
1793         case 0x000a:
1794         case 0x000b:
1795         case 0x000c:
1796         case 0x000d:
1797         case 0x0085:
1798         case 0x2028:
1799         case 0x2029:
1800         break;
1801
1802         default:
1803         ADD_NEW(state_offset + 1, 0);
1804         break;
1805         }
1806       break;
1807
1808       /*-----------------------------------------------------------------*/
1809       case OP_VSPACE:
1810       if (clen > 0) switch(c)
1811         {
1812         case 0x000a:
1813         case 0x000b:
1814         case 0x000c:
1815         case 0x000d:
1816         case 0x0085:
1817         case 0x2028:
1818         case 0x2029:
1819         ADD_NEW(state_offset + 1, 0);
1820         break;
1821
1822         default: break;
1823         }
1824       break;
1825
1826       /*-----------------------------------------------------------------*/
1827       case OP_NOT_HSPACE:
1828       if (clen > 0) switch(c)
1829         {
1830         case 0x09:      /* HT */
1831         case 0x20:      /* SPACE */
1832         case 0xa0:      /* NBSP */
1833         case 0x1680:    /* OGHAM SPACE MARK */
1834         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1835         case 0x2000:    /* EN QUAD */
1836         case 0x2001:    /* EM QUAD */
1837         case 0x2002:    /* EN SPACE */
1838         case 0x2003:    /* EM SPACE */
1839         case 0x2004:    /* THREE-PER-EM SPACE */
1840         case 0x2005:    /* FOUR-PER-EM SPACE */
1841         case 0x2006:    /* SIX-PER-EM SPACE */
1842         case 0x2007:    /* FIGURE SPACE */
1843         case 0x2008:    /* PUNCTUATION SPACE */
1844         case 0x2009:    /* THIN SPACE */
1845         case 0x200A:    /* HAIR SPACE */
1846         case 0x202f:    /* NARROW NO-BREAK SPACE */
1847         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1848         case 0x3000:    /* IDEOGRAPHIC SPACE */
1849         break;
1850
1851         default:
1852         ADD_NEW(state_offset + 1, 0);
1853         break;
1854         }
1855       break;
1856
1857       /*-----------------------------------------------------------------*/
1858       case OP_HSPACE:
1859       if (clen > 0) switch(c)
1860         {
1861         case 0x09:      /* HT */
1862         case 0x20:      /* SPACE */
1863         case 0xa0:      /* NBSP */
1864         case 0x1680:    /* OGHAM SPACE MARK */
1865         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1866         case 0x2000:    /* EN QUAD */
1867         case 0x2001:    /* EM QUAD */
1868         case 0x2002:    /* EN SPACE */
1869         case 0x2003:    /* EM SPACE */
1870         case 0x2004:    /* THREE-PER-EM SPACE */
1871         case 0x2005:    /* FOUR-PER-EM SPACE */
1872         case 0x2006:    /* SIX-PER-EM SPACE */
1873         case 0x2007:    /* FIGURE SPACE */
1874         case 0x2008:    /* PUNCTUATION SPACE */
1875         case 0x2009:    /* THIN SPACE */
1876         case 0x200A:    /* HAIR SPACE */
1877         case 0x202f:    /* NARROW NO-BREAK SPACE */
1878         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1879         case 0x3000:    /* IDEOGRAPHIC SPACE */
1880         ADD_NEW(state_offset + 1, 0);
1881         break;
1882         }
1883       break;
1884
1885       /*-----------------------------------------------------------------*/
1886       /* Match a negated single character. This is only used for one-byte
1887       characters, that is, we know that d < 256. The character we are
1888       checking (c) can be multibyte. */
1889
1890       case OP_NOT:
1891       if (clen > 0)
1892         {
1893         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1894         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1895         }
1896       break;
1897
1898       /*-----------------------------------------------------------------*/
1899       case OP_PLUS:
1900       case OP_MINPLUS:
1901       case OP_POSPLUS:
1902       case OP_NOTPLUS:
1903       case OP_NOTMINPLUS:
1904       case OP_NOTPOSPLUS:
1905       count = current_state->count;  /* Already matched */
1906       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1907       if (clen > 0)
1908         {
1909         unsigned int otherd = NOTACHAR;
1910         if ((ims & PCRE_CASELESS) != 0)
1911           {
1912 #ifdef SUPPORT_UTF8
1913           if (utf8 && d >= 128)
1914             {
1915 #ifdef SUPPORT_UCP
1916             otherd = _pcre_ucp_othercase(d);
1917 #endif  /* SUPPORT_UCP */
1918             }
1919           else
1920 #endif  /* SUPPORT_UTF8 */
1921           otherd = fcc[d];
1922           }
1923         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1924           {
1925           if (count > 0 &&
1926               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1927             {
1928             active_count--;             /* Remove non-match possibility */
1929             next_active_state--;
1930             }
1931           count++;
1932           ADD_NEW(state_offset, count);
1933           }
1934         }
1935       break;
1936
1937       /*-----------------------------------------------------------------*/
1938       case OP_QUERY:
1939       case OP_MINQUERY:
1940       case OP_POSQUERY:
1941       case OP_NOTQUERY:
1942       case OP_NOTMINQUERY:
1943       case OP_NOTPOSQUERY:
1944       ADD_ACTIVE(state_offset + dlen + 1, 0);
1945       if (clen > 0)
1946         {
1947         unsigned int otherd = NOTACHAR;
1948         if ((ims & PCRE_CASELESS) != 0)
1949           {
1950 #ifdef SUPPORT_UTF8
1951           if (utf8 && d >= 128)
1952             {
1953 #ifdef SUPPORT_UCP
1954             otherd = _pcre_ucp_othercase(d);
1955 #endif  /* SUPPORT_UCP */
1956             }
1957           else
1958 #endif  /* SUPPORT_UTF8 */
1959           otherd = fcc[d];
1960           }
1961         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1962           {
1963           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1964             {
1965             active_count--;            /* Remove non-match possibility */
1966             next_active_state--;
1967             }
1968           ADD_NEW(state_offset + dlen + 1, 0);
1969           }
1970         }
1971       break;
1972
1973       /*-----------------------------------------------------------------*/
1974       case OP_STAR:
1975       case OP_MINSTAR:
1976       case OP_POSSTAR:
1977       case OP_NOTSTAR:
1978       case OP_NOTMINSTAR:
1979       case OP_NOTPOSSTAR:
1980       ADD_ACTIVE(state_offset + dlen + 1, 0);
1981       if (clen > 0)
1982         {
1983         unsigned int otherd = NOTACHAR;
1984         if ((ims & PCRE_CASELESS) != 0)
1985           {
1986 #ifdef SUPPORT_UTF8
1987           if (utf8 && d >= 128)
1988             {
1989 #ifdef SUPPORT_UCP
1990             otherd = _pcre_ucp_othercase(d);
1991 #endif  /* SUPPORT_UCP */
1992             }
1993           else
1994 #endif  /* SUPPORT_UTF8 */
1995           otherd = fcc[d];
1996           }
1997         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1998           {
1999           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2000             {
2001             active_count--;            /* Remove non-match possibility */
2002             next_active_state--;
2003             }
2004           ADD_NEW(state_offset, 0);
2005           }
2006         }
2007       break;
2008
2009       /*-----------------------------------------------------------------*/
2010       case OP_EXACT:
2011       case OP_NOTEXACT:
2012       count = current_state->count;  /* Number already matched */
2013       if (clen > 0)
2014         {
2015         unsigned int otherd = NOTACHAR;
2016         if ((ims & PCRE_CASELESS) != 0)
2017           {
2018 #ifdef SUPPORT_UTF8
2019           if (utf8 && d >= 128)
2020             {
2021 #ifdef SUPPORT_UCP
2022             otherd = _pcre_ucp_othercase(d);
2023 #endif  /* SUPPORT_UCP */
2024             }
2025           else
2026 #endif  /* SUPPORT_UTF8 */
2027           otherd = fcc[d];
2028           }
2029         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2030           {
2031           if (++count >= GET2(code, 1))
2032             { ADD_NEW(state_offset + dlen + 3, 0); }
2033           else
2034             { ADD_NEW(state_offset, count); }
2035           }
2036         }
2037       break;
2038
2039       /*-----------------------------------------------------------------*/
2040       case OP_UPTO:
2041       case OP_MINUPTO:
2042       case OP_POSUPTO:
2043       case OP_NOTUPTO:
2044       case OP_NOTMINUPTO:
2045       case OP_NOTPOSUPTO:
2046       ADD_ACTIVE(state_offset + dlen + 3, 0);
2047       count = current_state->count;  /* Number already matched */
2048       if (clen > 0)
2049         {
2050         unsigned int otherd = NOTACHAR;
2051         if ((ims & PCRE_CASELESS) != 0)
2052           {
2053 #ifdef SUPPORT_UTF8
2054           if (utf8 && d >= 128)
2055             {
2056 #ifdef SUPPORT_UCP
2057             otherd = _pcre_ucp_othercase(d);
2058 #endif  /* SUPPORT_UCP */
2059             }
2060           else
2061 #endif  /* SUPPORT_UTF8 */
2062           otherd = fcc[d];
2063           }
2064         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2065           {
2066           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2067             {
2068             active_count--;             /* Remove non-match possibility */
2069             next_active_state--;
2070             }
2071           if (++count >= GET2(code, 1))
2072             { ADD_NEW(state_offset + dlen + 3, 0); }
2073           else
2074             { ADD_NEW(state_offset, count); }
2075           }
2076         }
2077       break;
2078
2079
2080 /* ========================================================================== */
2081       /* These are the class-handling opcodes */
2082
2083       case OP_CLASS:
2084       case OP_NCLASS:
2085       case OP_XCLASS:
2086         {
2087         BOOL isinclass = FALSE;
2088         int next_state_offset;
2089         const uschar *ecode;
2090
2091         /* For a simple class, there is always just a 32-byte table, and we
2092         can set isinclass from it. */
2093
2094         if (codevalue != OP_XCLASS)
2095           {
2096           ecode = code + 33;
2097           if (clen > 0)
2098             {
2099             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2100               ((code[1 + c/8] & (1 << (c&7))) != 0);
2101             }
2102           }
2103
2104         /* An extended class may have a table or a list of single characters,
2105         ranges, or both, and it may be positive or negative. There's a
2106         function that sorts all this out. */
2107
2108         else
2109          {
2110          ecode = code + GET(code, 1);
2111          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2112          }
2113
2114         /* At this point, isinclass is set for all kinds of class, and ecode
2115         points to the byte after the end of the class. If there is a
2116         quantifier, this is where it will be. */
2117
2118         next_state_offset = ecode - start_code;
2119
2120         switch (*ecode)
2121           {
2122           case OP_CRSTAR:
2123           case OP_CRMINSTAR:
2124           ADD_ACTIVE(next_state_offset + 1, 0);
2125           if (isinclass) { ADD_NEW(state_offset, 0); }
2126           break;
2127
2128           case OP_CRPLUS:
2129           case OP_CRMINPLUS:
2130           count = current_state->count;  /* Already matched */
2131           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2132           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2133           break;
2134
2135           case OP_CRQUERY:
2136           case OP_CRMINQUERY:
2137           ADD_ACTIVE(next_state_offset + 1, 0);
2138           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2139           break;
2140
2141           case OP_CRRANGE:
2142           case OP_CRMINRANGE:
2143           count = current_state->count;  /* Already matched */
2144           if (count >= GET2(ecode, 1))
2145             { ADD_ACTIVE(next_state_offset + 5, 0); }
2146           if (isinclass)
2147             {
2148             int max = GET2(ecode, 3);
2149             if (++count >= max && max != 0)   /* Max 0 => no limit */
2150               { ADD_NEW(next_state_offset + 5, 0); }
2151             else
2152               { ADD_NEW(state_offset, count); }
2153             }
2154           break;
2155
2156           default:
2157           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2158           break;
2159           }
2160         }
2161       break;
2162
2163 /* ========================================================================== */
2164       /* These are the opcodes for fancy brackets of various kinds. We have
2165       to use recursion in order to handle them. */
2166
2167       case OP_ASSERT:
2168       case OP_ASSERT_NOT:
2169       case OP_ASSERTBACK:
2170       case OP_ASSERTBACK_NOT:
2171         {
2172         int rc;
2173         int local_offsets[2];
2174         int local_workspace[1000];
2175         const uschar *endasscode = code + GET(code, 1);
2176
2177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178
2179         rc = internal_dfa_exec(
2180           md,                                   /* static match data */
2181           code,                                 /* this subexpression's code */
2182           ptr,                                  /* where we currently are */
2183           ptr - start_subject,                  /* start offset */
2184           local_offsets,                        /* offset vector */
2185           sizeof(local_offsets)/sizeof(int),    /* size of same */
2186           local_workspace,                      /* workspace vector */
2187           sizeof(local_workspace)/sizeof(int),  /* size of same */
2188           ims,                                  /* the current ims flags */
2189           rlevel,                               /* function recursion level */
2190           recursing);                           /* pass on regex recursion */
2191
2192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194         }
2195       break;
2196
2197       /*-----------------------------------------------------------------*/
2198       case OP_COND:
2199       case OP_SCOND:
2200         {
2201         int local_offsets[1000];
2202         int local_workspace[1000];
2203         int condcode = code[LINK_SIZE+1];
2204
2205         /* Back reference conditions are not supported */
2206
2207         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2208
2209         /* The DEFINE condition is always false */
2210
2211         if (condcode == OP_DEF)
2212           {
2213           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2214           }
2215
2216         /* The only supported version of OP_RREF is for the value RREF_ANY,
2217         which means "test if in any recursion". We can't test for specifically
2218         recursed groups. */
2219
2220         else if (condcode == OP_RREF)
2221           {
2222           int value = GET2(code, LINK_SIZE+2);
2223           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2224           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2225             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2226           }
2227
2228         /* Otherwise, the condition is an assertion */
2229
2230         else
2231           {
2232           int rc;
2233           const uschar *asscode = code + LINK_SIZE + 1;
2234           const uschar *endasscode = asscode + GET(asscode, 1);
2235
2236           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2237
2238           rc = internal_dfa_exec(
2239             md,                                   /* fixed match data */
2240             asscode,                              /* this subexpression's code */
2241             ptr,                                  /* where we currently are */
2242             ptr - start_subject,                  /* start offset */
2243             local_offsets,                        /* offset vector */
2244             sizeof(local_offsets)/sizeof(int),    /* size of same */
2245             local_workspace,                      /* workspace vector */
2246             sizeof(local_workspace)/sizeof(int),  /* size of same */
2247             ims,                                  /* the current ims flags */
2248             rlevel,                               /* function recursion level */
2249             recursing);                           /* pass on regex recursion */
2250
2251           if ((rc >= 0) ==
2252                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2253             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2254           else
2255             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2256           }
2257         }
2258       break;
2259
2260       /*-----------------------------------------------------------------*/
2261       case OP_RECURSE:
2262         {
2263         int local_offsets[1000];
2264         int local_workspace[1000];
2265         int rc;
2266
2267         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2268           recursing + 1));
2269
2270         rc = internal_dfa_exec(
2271           md,                                   /* fixed match data */
2272           start_code + GET(code, 1),            /* this subexpression's code */
2273           ptr,                                  /* where we currently are */
2274           ptr - start_subject,                  /* start offset */
2275           local_offsets,                        /* offset vector */
2276           sizeof(local_offsets)/sizeof(int),    /* size of same */
2277           local_workspace,                      /* workspace vector */
2278           sizeof(local_workspace)/sizeof(int),  /* size of same */
2279           ims,                                  /* the current ims flags */
2280           rlevel,                               /* function recursion level */
2281           recursing + 1);                       /* regex recurse level */
2282
2283         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2284           recursing + 1, rc));
2285
2286         /* Ran out of internal offsets */
2287
2288         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2289
2290         /* For each successful matched substring, set up the next state with a
2291         count of characters to skip before trying it. Note that the count is in
2292         characters, not bytes. */
2293
2294         if (rc > 0)
2295           {
2296           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2297             {
2298             const uschar *p = start_subject + local_offsets[rc];
2299             const uschar *pp = start_subject + local_offsets[rc+1];
2300             int charcount = local_offsets[rc+1] - local_offsets[rc];
2301             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2302             if (charcount > 0)
2303               {
2304               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2305               }
2306             else
2307               {
2308               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2309               }
2310             }
2311           }
2312         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2313         }
2314       break;
2315
2316       /*-----------------------------------------------------------------*/
2317       case OP_ONCE:
2318         {
2319         int local_offsets[2];
2320         int local_workspace[1000];
2321
2322         int rc = internal_dfa_exec(
2323           md,                                   /* fixed match data */
2324           code,                                 /* this subexpression's code */
2325           ptr,                                  /* where we currently are */
2326           ptr - start_subject,                  /* start offset */
2327           local_offsets,                        /* offset vector */
2328           sizeof(local_offsets)/sizeof(int),    /* size of same */
2329           local_workspace,                      /* workspace vector */
2330           sizeof(local_workspace)/sizeof(int),  /* size of same */
2331           ims,                                  /* the current ims flags */
2332           rlevel,                               /* function recursion level */
2333           recursing);                           /* pass on regex recursion */
2334
2335         if (rc >= 0)
2336           {
2337           const uschar *end_subpattern = code;
2338           int charcount = local_offsets[1] - local_offsets[0];
2339           int next_state_offset, repeat_state_offset;
2340
2341           do { end_subpattern += GET(end_subpattern, 1); }
2342             while (*end_subpattern == OP_ALT);
2343           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2344
2345           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2346           arrange for the repeat state also to be added to the relevant list.
2347           Calculate the offset, or set -1 for no repeat. */
2348
2349           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2350                                  *end_subpattern == OP_KETRMIN)?
2351             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2352
2353           /* If we have matched an empty string, add the next state at the
2354           current character pointer. This is important so that the duplicate
2355           checking kicks in, which is what breaks infinite loops that match an
2356           empty string. */
2357
2358           if (charcount == 0)
2359             {
2360             ADD_ACTIVE(next_state_offset, 0);
2361             }
2362
2363           /* Optimization: if there are no more active states, and there
2364           are no new states yet set up, then skip over the subject string
2365           right here, to save looping. Otherwise, set up the new state to swing
2366           into action when the end of the substring is reached. */
2367
2368           else if (i + 1 >= active_count && new_count == 0)
2369             {
2370             ptr += charcount;
2371             clen = 0;
2372             ADD_NEW(next_state_offset, 0);
2373
2374             /* If we are adding a repeat state at the new character position,
2375             we must fudge things so that it is the only current state.
2376             Otherwise, it might be a duplicate of one we processed before, and
2377             that would cause it to be skipped. */
2378
2379             if (repeat_state_offset >= 0)
2380               {
2381               next_active_state = active_states;
2382               active_count = 0;
2383               i = -1;
2384               ADD_ACTIVE(repeat_state_offset, 0);
2385               }
2386             }
2387           else
2388             {
2389             const uschar *p = start_subject + local_offsets[0];
2390             const uschar *pp = start_subject + local_offsets[1];
2391             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2392             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2393             if (repeat_state_offset >= 0)
2394               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2395             }
2396
2397           }
2398         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2399         }
2400       break;
2401
2402
2403 /* ========================================================================== */
2404       /* Handle callouts */
2405
2406       case OP_CALLOUT:
2407       if (pcre_callout != NULL)
2408         {
2409         int rrc;
2410         pcre_callout_block cb;
2411         cb.version          = 1;   /* Version 1 of the callout block */
2412         cb.callout_number   = code[1];
2413         cb.offset_vector    = offsets;
2414         cb.subject          = (PCRE_SPTR)start_subject;
2415         cb.subject_length   = end_subject - start_subject;
2416         cb.start_match      = current_subject - start_subject;
2417         cb.current_position = ptr - start_subject;
2418         cb.pattern_position = GET(code, 2);
2419         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2420         cb.capture_top      = 1;
2421         cb.capture_last     = -1;
2422         cb.callout_data     = md->callout_data;
2423         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2424         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2425         }
2426       break;
2427
2428
2429 /* ========================================================================== */
2430       default:        /* Unsupported opcode */
2431       return PCRE_ERROR_DFA_UITEM;
2432       }
2433
2434     NEXT_ACTIVE_STATE: continue;
2435
2436     }      /* End of loop scanning active states */
2437
2438   /* We have finished the processing at the current subject character. If no
2439   new states have been set for the next character, we have found all the
2440   matches that we are going to find. If we are at the top level and partial
2441   matching has been requested, check for appropriate conditions. */
2442
2443   if (new_count <= 0)
2444     {
2445     if (match_count < 0 &&                     /* No matches found */
2446         rlevel == 1 &&                         /* Top level match function */
2447         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2448         ptr >= end_subject &&                  /* Reached end of subject */
2449         ptr > current_subject)                 /* Matched non-empty string */
2450       {
2451       if (offsetcount >= 2)
2452         {
2453         offsets[0] = current_subject - start_subject;
2454         offsets[1] = end_subject - start_subject;
2455         }
2456       match_count = PCRE_ERROR_PARTIAL;
2457       }
2458
2459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2460       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2461       rlevel*2-2, SP));
2462     break;        /* In effect, "return", but see the comment below */
2463     }
2464
2465   /* One or more states are active for the next character. */
2466
2467   ptr += clen;    /* Advance to next subject character */
2468   }               /* Loop to move along the subject string */
2469
2470 /* Control gets here from "break" a few lines above. We do it this way because
2471 if we use "return" above, we have compiler trouble. Some compilers warn if
2472 there's nothing here because they think the function doesn't return a value. On
2473 the other hand, if we put a dummy statement here, some more clever compilers
2474 complain that it can't be reached. Sigh. */
2475
2476 return match_count;
2477 }
2478
2479
2480
2481
2482 /*************************************************
2483 *    Execute a Regular Expression - DFA engine   *
2484 *************************************************/
2485
2486 /* This external function applies a compiled re to a subject string using a DFA
2487 engine. This function calls the internal function multiple times if the pattern
2488 is not anchored.
2489
2490 Arguments:
2491   argument_re     points to the compiled expression
2492   extra_data      points to extra data or is NULL
2493   subject         points to the subject string
2494   length          length of subject string (may contain binary zeros)
2495   start_offset    where to start in the subject string
2496   options         option bits
2497   offsets         vector of match offsets
2498   offsetcount     size of same
2499   workspace       workspace vector
2500   wscount         size of same
2501
2502 Returns:          > 0 => number of match offset pairs placed in offsets
2503                   = 0 => offsets overflowed; longest matches are present
2504                    -1 => failed to match
2505                  < -1 => some kind of unexpected problem
2506 */
2507
2508 PCRE_EXP_DEFN int
2509 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510   const char *subject, int length, int start_offset, int options, int *offsets,
2511   int offsetcount, int *workspace, int wscount)
2512 {
2513 real_pcre *re = (real_pcre *)argument_re;
2514 dfa_match_data match_block;
2515 dfa_match_data *md = &match_block;
2516 BOOL utf8, anchored, startline, firstline;
2517 const uschar *current_subject, *end_subject, *lcc;
2518
2519 pcre_study_data internal_study;
2520 const pcre_study_data *study = NULL;
2521 real_pcre internal_re;
2522
2523 const uschar *req_byte_ptr;
2524 const uschar *start_bits = NULL;
2525 BOOL first_byte_caseless = FALSE;
2526 BOOL req_byte_caseless = FALSE;
2527 int first_byte = -1;
2528 int req_byte = -1;
2529 int req_byte2 = -1;
2530 int newline;
2531
2532 /* Plausibility checks */
2533
2534 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2535 if (re == NULL || subject == NULL || workspace == NULL ||
2536    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2537 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2538 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2539
2540 /* We need to find the pointer to any study data before we test for byte
2541 flipping, so we scan the extra_data block first. This may set two fields in the
2542 match block, so we must initialize them beforehand. However, the other fields
2543 in the match block must not be set until after the byte flipping. */
2544
2545 md->tables = re->tables;
2546 md->callout_data = NULL;
2547
2548 if (extra_data != NULL)
2549   {
2550   unsigned int flags = extra_data->flags;
2551   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2552     study = (const pcre_study_data *)extra_data->study_data;
2553   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2554   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2555     return PCRE_ERROR_DFA_UMLIMIT;
2556   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2557     md->callout_data = extra_data->callout_data;
2558   if ((flags & PCRE_EXTRA_TABLES) != 0)
2559     md->tables = extra_data->tables;
2560   }
2561
2562 /* Check that the first field in the block is the magic number. If it is not,
2563 test for a regex that was compiled on a host of opposite endianness. If this is
2564 the case, flipped values are put in internal_re and internal_study if there was
2565 study data too. */
2566
2567 if (re->magic_number != MAGIC_NUMBER)
2568   {
2569   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2570   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2571   if (study != NULL) study = &internal_study;
2572   }
2573
2574 /* Set some local values */
2575
2576 current_subject = (const unsigned char *)subject + start_offset;
2577 end_subject = (const unsigned char *)subject + length;
2578 req_byte_ptr = current_subject - 1;
2579
2580 #ifdef SUPPORT_UTF8
2581 utf8 = (re->options & PCRE_UTF8) != 0;
2582 #else
2583 utf8 = FALSE;
2584 #endif
2585
2586 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2587   (re->options & PCRE_ANCHORED) != 0;
2588
2589 /* The remaining fixed data for passing around. */
2590
2591 md->start_code = (const uschar *)argument_re +
2592     re->name_table_offset + re->name_count * re->name_entry_size;
2593 md->start_subject = (const unsigned char *)subject;
2594 md->end_subject = end_subject;
2595 md->moptions = options;
2596 md->poptions = re->options;
2597
2598 /* If the BSR option is not set at match time, copy what was set
2599 at compile time. */
2600
2601 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602   {
2603   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605 #ifdef BSR_ANYCRLF
2606   else md->moptions |= PCRE_BSR_ANYCRLF;
2607 #endif
2608   }
2609
2610 /* Handle different types of newline. The three bits give eight cases. If
2611 nothing is set at run time, whatever was used at compile time applies. */
2612
2613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2614          PCRE_NEWLINE_BITS)
2615   {
2616   case 0: newline = NEWLINE; break;   /* Compile-time default */
2617   case PCRE_NEWLINE_CR: newline = '\r'; break;
2618   case PCRE_NEWLINE_LF: newline = '\n'; break;
2619   case PCRE_NEWLINE_CR+
2620        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2621   case PCRE_NEWLINE_ANY: newline = -1; break;
2622   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2623   default: return PCRE_ERROR_BADNEWLINE;
2624   }
2625
2626 if (newline == -2)
2627   {
2628   md->nltype = NLTYPE_ANYCRLF;
2629   }
2630 else if (newline < 0)
2631   {
2632   md->nltype = NLTYPE_ANY;
2633   }
2634 else
2635   {
2636   md->nltype = NLTYPE_FIXED;
2637   if (newline > 255)
2638     {
2639     md->nllen = 2;
2640     md->nl[0] = (newline >> 8) & 255;
2641     md->nl[1] = newline & 255;
2642     }
2643   else
2644     {
2645     md->nllen = 1;
2646     md->nl[0] = newline;
2647     }
2648   }
2649
2650 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2651 back the character offset. */
2652
2653 #ifdef SUPPORT_UTF8
2654 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2655   {
2656   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2657     return PCRE_ERROR_BADUTF8;
2658   if (start_offset > 0 && start_offset < length)
2659     {
2660     int tb = ((uschar *)subject)[start_offset];
2661     if (tb > 127)
2662       {
2663       tb &= 0xc0;
2664       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2665       }
2666     }
2667   }
2668 #endif
2669
2670 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2671 is a feature that makes it possible to save compiled regex and re-use them
2672 in other programs later. */
2673
2674 if (md->tables == NULL) md->tables = _pcre_default_tables;
2675
2676 /* The lower casing table and the "must be at the start of a line" flag are
2677 used in a loop when finding where to start. */
2678
2679 lcc = md->tables + lcc_offset;
2680 startline = (re->flags & PCRE_STARTLINE) != 0;
2681 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682
2683 /* Set up the first character to match, if available. The first_byte value is
2684 never set for an anchored regular expression, but the anchoring may be forced
2685 at run time, so we have to test for anchoring. The first char may be unset for
2686 an unanchored pattern, of course. If there's no first char and the pattern was
2687 studied, there may be a bitmap of possible first characters. */
2688
2689 if (!anchored)
2690   {
2691   if ((re->flags & PCRE_FIRSTSET) != 0)
2692     {
2693     first_byte = re->first_byte & 255;
2694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2695       first_byte = lcc[first_byte];
2696     }
2697   else
2698     {
2699     if (startline && study != NULL &&
2700          (study->options & PCRE_STUDY_MAPPED) != 0)
2701       start_bits = study->start_bits;
2702     }
2703   }
2704
2705 /* For anchored or unanchored matches, there may be a "last known required
2706 character" set. */
2707
2708 if ((re->flags & PCRE_REQCHSET) != 0)
2709   {
2710   req_byte = re->req_byte & 255;
2711   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2712   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2713   }
2714
2715 /* Call the main matching function, looping for a non-anchored regex after a
2716 failed match. Unless restarting, optimize by moving to the first match
2717 character if possible, when not anchored. Then unless wanting a partial match,
2718 check for a required later character. */
2719
2720 for (;;)
2721   {
2722   int rc;
2723
2724   if ((options & PCRE_DFA_RESTART) == 0)
2725     {
2726     const uschar *save_end_subject = end_subject;
2727
2728     /* Advance to a unique first char if possible. If firstline is TRUE, the
2729     start of the match is constrained to the first line of a multiline string.
2730     Implement this by temporarily adjusting end_subject so that we stop
2731     scanning at a newline. If the match fails at the newline, later code breaks
2732     this loop. */
2733
2734     if (firstline)
2735       {
2736       const uschar *t = current_subject;
2737       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2738       end_subject = t;
2739       }
2740
2741     if (first_byte >= 0)
2742       {
2743       if (first_byte_caseless)
2744         while (current_subject < end_subject &&
2745                lcc[*current_subject] != first_byte)
2746           current_subject++;
2747       else
2748         while (current_subject < end_subject && *current_subject != first_byte)
2749           current_subject++;
2750       }
2751
2752     /* Or to just after a linebreak for a multiline match if possible */
2753
2754     else if (startline)
2755       {
2756       if (current_subject > md->start_subject + start_offset)
2757         {
2758         while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2759           current_subject++;
2760
2761         /* If we have just passed a CR and the newline option is ANY or
2762         ANYCRLF, and we are now at a LF, advance the match position by one more
2763         character. */
2764
2765         if (current_subject[-1] == '\r' &&
2766              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2767              current_subject < end_subject &&
2768              *current_subject == '\n')
2769           current_subject++;
2770         }
2771       }
2772
2773     /* Or to a non-unique first char after study */
2774
2775     else if (start_bits != NULL)
2776       {
2777       while (current_subject < end_subject)
2778         {
2779         register unsigned int c = *current_subject;
2780         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2781           else break;
2782         }
2783       }
2784
2785     /* Restore fudged end_subject */
2786
2787     end_subject = save_end_subject;
2788     }
2789
2790   /* If req_byte is set, we know that that character must appear in the subject
2791   for the match to succeed. If the first character is set, req_byte must be
2792   later in the subject; otherwise the test starts at the match point. This
2793   optimization can save a huge amount of work in patterns with nested unlimited
2794   repeats that aren't going to match. Writing separate code for cased/caseless
2795   versions makes it go faster, as does using an autoincrement and backing off
2796   on a match.
2797
2798   HOWEVER: when the subject string is very, very long, searching to its end can
2799   take a long time, and give bad performance on quite ordinary patterns. This
2800   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2801   don't do this when the string is sufficiently long.
2802
2803   ALSO: this processing is disabled when partial matching is requested.
2804   */
2805
2806   if (req_byte >= 0 &&
2807       end_subject - current_subject < REQ_BYTE_MAX &&
2808       (options & PCRE_PARTIAL) == 0)
2809     {
2810     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2811
2812     /* We don't need to repeat the search if we haven't yet reached the
2813     place we found it at last time. */
2814
2815     if (p > req_byte_ptr)
2816       {
2817       if (req_byte_caseless)
2818         {
2819         while (p < end_subject)
2820           {
2821           register int pp = *p++;
2822           if (pp == req_byte || pp == req_byte2) { p--; break; }
2823           }
2824         }
2825       else
2826         {
2827         while (p < end_subject)
2828           {
2829           if (*p++ == req_byte) { p--; break; }
2830           }
2831         }
2832
2833       /* If we can't find the required character, break the matching loop,
2834       which will cause a return or PCRE_ERROR_NOMATCH. */
2835
2836       if (p >= end_subject) break;
2837
2838       /* If we have found the required character, save the point where we
2839       found it, so that we don't search again next time round the loop if
2840       the start hasn't passed this character yet. */
2841
2842       req_byte_ptr = p;
2843       }
2844     }
2845
2846   /* OK, now we can do the business */
2847
2848   rc = internal_dfa_exec(
2849     md,                                /* fixed match data */
2850     md->start_code,                    /* this subexpression's code */
2851     current_subject,                   /* where we currently are */
2852     start_offset,                      /* start offset in subject */
2853     offsets,                           /* offset vector */
2854     offsetcount,                       /* size of same */
2855     workspace,                         /* workspace vector */
2856     wscount,                           /* size of same */
2857     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2858     0,                                 /* function recurse level */
2859     0);                                /* regex recurse level */
2860
2861   /* Anything other than "no match" means we are done, always; otherwise, carry
2862   on only if not anchored. */
2863
2864   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2865
2866   /* Advance to the next subject character unless we are at the end of a line
2867   and firstline is set. */
2868
2869   if (firstline && IS_NEWLINE(current_subject)) break;
2870   current_subject++;
2871   if (utf8)
2872     {
2873     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2874       current_subject++;
2875     }
2876   if (current_subject > end_subject) break;
2877
2878   /* If we have just passed a CR and we are now at a LF, and the pattern does
2879   not contain any explicit matches for \r or \n, and the newline option is CRLF
2880   or ANY or ANYCRLF, advance the match position by one more character. */
2881
2882   if (current_subject[-1] == '\r' &&
2883       current_subject < end_subject &&
2884       *current_subject == '\n' &&
2885       (re->flags & PCRE_HASCRORLF) == 0 &&
2886         (md->nltype == NLTYPE_ANY ||
2887          md->nltype == NLTYPE_ANYCRLF ||
2888          md->nllen == 2))
2889     current_subject++;
2890
2891   }   /* "Bumpalong" loop */
2892
2893 return PCRE_ERROR_NOMATCH;
2894 }
2895
2896 /* End of pcre_dfa_exec.c */