glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2008 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 #ifdef HAVE_CONFIG_H
  48 #include "config.h"
  49 #endif
  50
  51 #define NLBLOCK md             /* Block containing newline information */
  52 #define PSSTART start_subject  /* Field containing processed string start */
  53 #define PSEND   end_subject    /* Field containing processed string end */
  54
  55 #include "pcre_internal.h"
  56
  57
  58 /* A run of spaces, printed with a "%.*s" precision, to indent debugging output */
  59
  60 #define SP "                   "
  61
  62
  63
  64 /*************************************************
  65 *      Code parameters and static tables         *
  66 *************************************************/
  67
  68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  69 into others, under special conditions. A gap of 20 between the blocks should be
  70 enough. The resulting opcodes don't have to be less than 256 because they are
  71 never stored, so we push them well clear of the normal opcodes. */
  72
  73 #define OP_PROP_EXTRA       300   /* added when the type operand is OP_PROP or OP_NOTPROP */
  74 #define OP_EXTUNI_EXTRA     320   /* added when the type operand is OP_EXTUNI */
  75 #define OP_ANYNL_EXTRA      340   /* added when the type operand is OP_ANYNL */
  76 #define OP_HSPACE_EXTRA     360   /* added when the type operand is OP_HSPACE or OP_NOT_HSPACE */
  77 #define OP_VSPACE_EXTRA     380   /* added when the type operand is OP_VSPACE or OP_NOT_VSPACE */
  78
  79
  80 /* This table identifies those opcodes that are followed immediately by a
  81 character that is to be tested in some way. This makes it possible to
  82 centralize the loading of these characters. In the case of Type * etc, the
  83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  84 small value. ***NOTE*** If the start of this table is modified, the two tables
  85 that follow must also be modified. */
  86
  87 static const uschar coptable[] = {   /* entry = offset from the opcode to its character operand; 0 = none */
  88   0,                             /* End                                    */
  89   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  90   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  91   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  92   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
  93   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  94   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  95   1,                             /* Char                                   */
  96   1,                             /* Charnc                                 */
  97   1,                             /* not                                    */
  98   /* Positive single-char repeats                                          */
  99   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 100   3, 3, 3,                       /* upto, minupto, exact                   */
 101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
 102   /* Negative single-char repeats - only for chars < 256                   */
 103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 104   3, 3, 3,                       /* NOT upto, minupto, exact               */
 105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
 106   /* Positive type repeats                                                 */
 107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 108   3, 3, 3,                       /* Type upto, minupto, exact              */
 109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 110   /* Character class & ref repeats                                         */
 111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 113   0,                             /* CLASS                                  */
 114   0,                             /* NCLASS                                 */
 115   0,                             /* XCLASS - variable length               */
 116   0,                             /* REF                                    */
 117   0,                             /* RECURSE                                */
 118   0,                             /* CALLOUT                                */
 119   0,                             /* Alt                                    */
 120   0,                             /* Ket                                    */
 121   0,                             /* KetRmax                                */
 122   0,                             /* KetRmin                                */
 123   0,                             /* Assert                                 */
 124   0,                             /* Assert not                             */
 125   0,                             /* Assert behind                          */
 126   0,                             /* Assert behind not                      */
 127   0,                             /* Reverse                                */
 128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 130   0,                             /* CREF                                   */
 131   0,                             /* RREF                                   */
 132   0,                             /* DEF                                    */
 133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
 134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
 135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
 136 };
 137
 138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 139 and \w */
 140
 141 static const uschar toptable1[] = {   /* indexed by opcode: mask ANDed with ctypes[c] */
 142   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
 143   ctype_digit, ctype_digit,       /* \D, \d */
 144   ctype_space, ctype_space,       /* \S, \s */
 145   ctype_word,  ctype_word,        /* \W, \w */
 146   0, 0                            /* OP_ANY, OP_ALLANY */
 147 };
 148
 149 static const uschar toptable2[] = {   /* indexed by opcode: XORed with the masked ctypes bits; non-zero = match */
 150   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
 151   ctype_digit, 0,                 /* \D (XOR inverts the digit test), \d */
 152   ctype_space, 0,                 /* \S (inverted), \s */
 153   ctype_word,  0,                 /* \W (inverted), \w */
 154   1, 1                            /* OP_ANY, OP_ALLANY */
 155 };
 156
 157
 158 /* Structure for holding data about a particular state, which is in effect the
 159 current data for an active path through the match tree. It must consist
 160 entirely of ints because the working vector we are passed, and which we put
 161 these structures in, is a vector of ints. */
 162
 163 typedef struct stateblock {
 164   int offset;                     /* Offset to opcode from start_code; negative = held-off state */
 165   int count;                      /* Count for repeats */
 166   int ims;                        /* ims flag bits in force for this state */
 167   int data;                       /* Extra data for some states, e.g. characters still to skip */
 168 } stateblock;
 169
 170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))   /* ints of workspace taken by one state */
 171
 172
 173 #ifdef DEBUG
 174 /*************************************************
 175 *             Print character string             *
 176 *************************************************/
 177
 178 /* Character string printing function for debugging.
 179
 180 Arguments:
 181   p            points to string
 182   length       number of bytes
 183   f            where to print
 184
 185 Returns:       nothing
 186 */
 187
 188 static void
 189 pchars(unsigned char *p, int length, FILE *f)
 190 {
 191 int remaining;                         /* bytes not yet written to f */
 192 for (remaining = length; remaining > 0; remaining--, p++)
 193   {
 194   int byte = *p;
 195   if (!isprint(byte))
 196     fprintf(f, "\\x%02x", byte);       /* non-printable: two-digit hex escape */
 197   else fprintf(f, "%c", byte);         /* printable: emitted unchanged */
 198   }
 199 }
 200 #endif
 201
 202
 203
 204 /*************************************************
 205 *    Execute a Regular Expression - DFA engine   *
 206 *************************************************/
 207
 208 /* This internal function applies a compiled pattern to a subject string,
 209 starting at a given point, using a DFA engine. This function is called from the
 210 external one, possibly multiple times if the pattern is not anchored. The
 211 function calls itself recursively for some kinds of subpattern.
 212
 213 Arguments:
 214   md                the match_data block with fixed information
 215   this_start_code   the opening bracket of this subexpression's code
 216   current_subject   where we currently are in the subject string
 217   start_offset      start offset in the subject string
 218   offsets           vector to contain the matching string offsets
 219   offsetcount       size of same
 220   workspace         vector of workspace
 221   wscount           size of same
 222   ims               the current ims flags
 223   rlevel            function call recursion level
 224   recursing         regex recursive call level
 225
 226 Returns:            > 0 => number of match offset pairs placed in offsets
 227                     = 0 => offsets overflowed; longest matches are present
 228                      -1 => failed to match
 229                    < -1 => some kind of unexpected problem
 230
 231 The following macros are used for adding states to the two state vectors (one
 232 for the current character, one for the following character). */
 233
 234 #define ADD_ACTIVE(x,y) /* append state (offset x, count y) to the active list */ \
 235   if (active_count++ < wscount) /* count is bumped even when the list is full */ \
 236     { \
 237     next_active_state->offset = (x); \
 238     next_active_state->count  = (y); \
 239     next_active_state->ims    = ims; /* ims from the enclosing scope; data is not written */ \
 240     next_active_state++; \
 241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 242     } \
 243   else return PCRE_ERROR_DFA_WSSIZE  /* no room: returns from the enclosing function */
 244
 245 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
 246   if (active_count++ < wscount) /* count is bumped even when the list is full */ \
 247     { \
 248     next_active_state->offset = (x); \
 249     next_active_state->count  = (y); \
 250     next_active_state->ims    = ims; /* ims from the enclosing scope */ \
 251     next_active_state->data   = (z); \
 252     next_active_state++; \
 253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 254     } \
 255   else return PCRE_ERROR_DFA_WSSIZE  /* no room: returns from the enclosing function */
 256
 257 #define ADD_NEW(x,y) /* append state (offset x, count y) to the new-state list */ \
 258   if (new_count++ < wscount) /* count is bumped even when the list is full */ \
 259     { \
 260     next_new_state->offset = (x); \
 261     next_new_state->count  = (y); \
 262     next_new_state->ims    = ims; /* ims from the enclosing scope; data is not written */ \
 263     next_new_state++; \
 264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 265     } \
 266   else return PCRE_ERROR_DFA_WSSIZE  /* no room: returns from the enclosing function */
 267
 268 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
 269   if (new_count++ < wscount) /* count is bumped even when the list is full */ \
 270     { \
 271     next_new_state->offset = (x); \
 272     next_new_state->count  = (y); \
 273     next_new_state->ims    = ims; /* ims from the enclosing scope */ \
 274     next_new_state->data   = (z); \
 275     next_new_state++; \
 276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 277     } \
 278   else return PCRE_ERROR_DFA_WSSIZE  /* no room: returns from the enclosing function */
 279
 280 /* And now, here is the code */
 281
 282 static int
 283 internal_dfa_exec(
 284   dfa_match_data *md,
 285   const uschar *this_start_code,
 286   const uschar *current_subject,
 287   int start_offset,
 288   int *offsets,
 289   int offsetcount,
 290   int *workspace,
 291   int wscount,
 292   int ims,
 293   int  rlevel,
 294   int  recursing)
 295 {
 296 stateblock *active_states, *new_states, *temp_states;
 297 stateblock *next_active_state, *next_new_state;
 298
 299 const uschar *ctypes, *lcc, *fcc;
 300 const uschar *ptr;
 301 const uschar *end_code, *first_op;
 302
 303 int active_count, new_count, match_count;
 304
 305 /* Some fields in the md block are frequently referenced, so we load them into
 306 independent variables in the hope that this will perform better. */
 307
 308 const uschar *start_subject = md->start_subject;
 309 const uschar *end_subject = md->end_subject;
 310 const uschar *start_code = md->start_code;
 311
 312 #ifdef SUPPORT_UTF8
 313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 314 #else
 315 BOOL utf8 = FALSE;
 316 #endif
 317
 318 rlevel++;
 319 offsetcount &= (-2);
 320
 321 wscount -= 2;
 322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 323           (2 * INTS_PER_STATEBLOCK);
 324
 325 DPRINTF(("\n%.*s---------------------\n"
 326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 328
 329 ctypes = md->tables + ctypes_offset;
 330 lcc = md->tables + lcc_offset;
 331 fcc = md->tables + fcc_offset;
 332
 333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 334
 335 active_states = (stateblock *)(workspace + 2);
 336 next_new_state = new_states = active_states + wscount;
 337 new_count = 0;
 338
 339 first_op = this_start_code + 1 + LINK_SIZE +
 340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 341
 342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 343 the alternative states onto the list, and find out where the end is. This
 344 makes it possible to use this function recursively, when we want to stop at a
 345 matching internal ket rather than at the end.
 346
 347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 348 a backward assertion. In that case, we have to find out the maximum amount to
 349 move back, and set up each alternative appropriately. */
 350
 351 if (*first_op == OP_REVERSE)
 352   {
 353   int max_back = 0;
 354   int gone_back;
 355
 356   end_code = this_start_code;
 357   do
 358     {
 359     int back = GET(end_code, 2+LINK_SIZE);
 360     if (back > max_back) max_back = back;
 361     end_code += GET(end_code, 1);
 362     }
 363   while (*end_code == OP_ALT);
 364
 365   /* If we can't go back the amount required for the longest lookbehind
 366   pattern, go back as far as we can; some alternatives may still be viable. */
 367
 368 #ifdef SUPPORT_UTF8
 369   /* In character mode we have to step back character by character */
 370
 371   if (utf8)
 372     {
 373     for (gone_back = 0; gone_back < max_back; gone_back++)
 374       {
 375       if (current_subject <= start_subject) break;
 376       current_subject--;
 377       while (current_subject > start_subject &&
 378              (*current_subject & 0xc0) == 0x80)
 379         current_subject--;
 380       }
 381     }
 382   else
 383 #endif
 384
 385   /* In byte-mode we can do this quickly. */
 386
 387     {
 388     gone_back = (current_subject - max_back < start_subject)?
 389       current_subject - start_subject : max_back;
 390     current_subject -= gone_back;
 391     }
 392
 393   /* Now we can process the individual branches. */
 394
 395   end_code = this_start_code;
 396   do
 397     {
 398     int back = GET(end_code, 2+LINK_SIZE);
 399     if (back <= gone_back)
 400       {
 401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 403       }
 404     end_code += GET(end_code, 1);
 405     }
 406   while (*end_code == OP_ALT);
 407  }
 408
 409 /* This is the code for a "normal" subpattern (not a backward assertion). The
 410 start of a whole pattern is always one of these. If we are at the top level,
 411 we may be asked to restart matching from the same point that we reached for a
 412 previous partial match. We still have to scan through the top-level branches to
 413 find the end state. */
 414
 415 else
 416   {
 417   end_code = this_start_code;
 418
 419   /* Restarting */
 420
 421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 422     {
 423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 424     new_count = workspace[1];
 425     if (!workspace[0])
 426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 427     }
 428
 429   /* Not restarting */
 430
 431   else
 432     {
 433     int length = 1 + LINK_SIZE +
 434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 435     do
 436       {
 437       ADD_NEW(end_code - start_code + length, 0);
 438       end_code += GET(end_code, 1);
 439       length = 1 + LINK_SIZE;
 440       }
 441     while (*end_code == OP_ALT);
 442     }
 443   }
 444
 445 workspace[0] = 0;    /* Bit indicating which vector is current */
 446
 447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 448
 449 /* Loop for scanning the subject */
 450
 451 ptr = current_subject;
 452 for (;;)
 453   {
 454   int i, j;
 455   int clen, dlen;
 456   unsigned int c, d;
 457
 458   /* Make the new state list into the active state list and empty the
 459   new state list. */
 460
 461   temp_states = active_states;
 462   active_states = new_states;
 463   new_states = temp_states;
 464   active_count = new_count;
 465   new_count = 0;
 466
 467   workspace[0] ^= 1;              /* Remember for the restarting feature */
 468   workspace[1] = active_count;
 469
 470 #ifdef DEBUG
 471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 473   printf("\"\n");
 474
 475   printf("%.*sActive states: ", rlevel*2-2, SP);
 476   for (i = 0; i < active_count; i++)
 477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 478   printf("\n");
 479 #endif
 480
 481   /* Set the pointers for adding new states */
 482
 483   next_active_state = active_states + active_count;
 484   next_new_state = new_states;
 485
 486   /* Load the current character from the subject outside the loop, as many
 487   different states may want to look at it, and we assume that at least one
 488   will. */
 489
 490   if (ptr < end_subject)
 491     {
 492     clen = 1;        /* Number of bytes in the character */
 493 #ifdef SUPPORT_UTF8
 494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 495 #endif  /* SUPPORT_UTF8 */
 496     c = *ptr;
 497     }
 498   else
 499     {
 500     clen = 0;        /* This indicates the end of the subject */
 501     c = NOTACHAR;    /* This value should never actually be used */
 502     }
 503
 504   /* Scan up the active states and act on each one. The result of an action
 505   may be to add more states to the currently active list (e.g. on hitting a
 506   parenthesis) or it may be to put states on the new list, for considering
 507   when we move the character pointer on. */
 508
 509   for (i = 0; i < active_count; i++)
 510     {
 511     stateblock *current_state = active_states + i;
 512     const uschar *code;
 513     int state_offset = current_state->offset;
 514     int count, codevalue;
 515
 516 #ifdef DEBUG
 517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 518     if (clen == 0) printf("EOL\n");
 519       else if (c > 32 && c < 127) printf("'%c'\n", c);
 520         else printf("0x%02x\n", c);
 521 #endif
 522
 523     /* This variable is referred to implicitly in the ADD_xxx macros. */
 524
 525     ims = current_state->ims;
 526
 527     /* A negative offset is a special case meaning "hold off going to this
 528     (negated) state until the number of characters in the data field have
 529     been skipped". */
 530
 531     if (state_offset < 0)
 532       {
 533       if (current_state->data > 0)
 534         {
 535         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 536         ADD_NEW_DATA(state_offset, current_state->count,
 537           current_state->data - 1);
 538         continue;
 539         }
 540       else
 541         {
 542         current_state->offset = state_offset = -state_offset;
 543         }
 544       }
 545
 546     /* Check for a duplicate state with the same count, and skip if found. */
 547
 548     for (j = 0; j < i; j++)
 549       {
 550       if (active_states[j].offset == state_offset &&
 551           active_states[j].count == current_state->count)
 552         {
 553         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 554         goto NEXT_ACTIVE_STATE;
 555         }
 556       }
 557
 558     /* The state offset is the offset to the opcode */
 559
 560     code = start_code + state_offset;
 561     codevalue = *code;
 562
 563     /* If this opcode is followed by an inline character, load it. It is
 564     tempting to test for the presence of a subject character here, but that
 565     is wrong, because sometimes zero repetitions of the subject are
 566     permitted.
 567
 568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 569     argument that is not a data character - but is always one byte long. We
 570     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
 571     this case. To keep the other cases fast, convert these ones to new opcodes.
 572     */
 573
 574     if (coptable[codevalue] > 0)
 575       {
 576       dlen = 1;
 577 #ifdef SUPPORT_UTF8
 578       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 579 #endif  /* SUPPORT_UTF8 */
 580       d = code[coptable[codevalue]];
 581       if (codevalue >= OP_TYPESTAR)
 582         {
 583         switch(d)
 584           {
 585           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 586           case OP_NOTPROP:
 587           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 588           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 589           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 590           case OP_NOT_HSPACE:
 591           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 592           case OP_NOT_VSPACE:
 593           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 594           default: break;
 595           }
 596         }
 597       }
 598     else
 599       {
 600       dlen = 0;         /* Not strictly necessary, but compilers moan */
 601       d = NOTACHAR;     /* if these variables are not set. */
 602       }
 603
 604
 605     /* Now process the individual opcodes */
 606
 607     switch (codevalue)
 608       {
 609
 610 /* ========================================================================== */
 611       /* Reached a closing bracket. If not at the end of the pattern, carry
 612       on with the next opcode. Otherwise, unless we have an empty string and
 613       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
 614       matches so we always have the longest first. */
 615
 616       case OP_KET:
 617       case OP_KETRMIN:
 618       case OP_KETRMAX:
 619       if (code != end_code)
 620         {
 621         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 622         if (codevalue != OP_KET)
 623           {
 624           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 625           }
 626         }
 627       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
 628         {
 629         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 630           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 631             match_count = 0;
 632         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 633         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 634         if (offsetcount >= 2)
 635           {
 636           offsets[0] = current_subject - start_subject;
 637           offsets[1] = ptr - start_subject;
 638           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 639             offsets[1] - offsets[0], current_subject));
 640           }
 641         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 642           {
 643           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 644             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 645             match_count, rlevel*2-2, SP));
 646           return match_count;
 647           }
 648         }
 649       break;
 650
 651 /* ========================================================================== */
 652       /* These opcodes add to the current list of states without looking
 653       at the current character. */
 654
 655       /*-----------------------------------------------------------------*/
 656       case OP_ALT:
 657       do { code += GET(code, 1); } while (*code == OP_ALT);
 658       ADD_ACTIVE(code - start_code, 0);
 659       break;
 660
 661       /*-----------------------------------------------------------------*/
 662       case OP_BRA:
 663       case OP_SBRA:
 664       do
 665         {
 666         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 667         code += GET(code, 1);
 668         }
 669       while (*code == OP_ALT);
 670       break;
 671
 672       /*-----------------------------------------------------------------*/
 673       case OP_CBRA:
 674       case OP_SCBRA:
 675       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 676       code += GET(code, 1);
 677       while (*code == OP_ALT)
 678         {
 679         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 680         code += GET(code, 1);
 681         }
 682       break;
 683
 684       /*-----------------------------------------------------------------*/
 685       case OP_BRAZERO:
 686       case OP_BRAMINZERO:
 687       ADD_ACTIVE(state_offset + 1, 0);
 688       code += 1 + GET(code, 2);
 689       while (*code == OP_ALT) code += GET(code, 1);
 690       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 691       break;
 692
 693       /*-----------------------------------------------------------------*/
 694       case OP_SKIPZERO:
 695       code += 1 + GET(code, 2);
 696       while (*code == OP_ALT) code += GET(code, 1);
 697       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 698       break;
 699
 700       /*-----------------------------------------------------------------*/
 701       case OP_CIRC:
 702       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 703           ((ims & PCRE_MULTILINE) != 0 &&
 704             ptr != end_subject &&
 705             WAS_NEWLINE(ptr)))
 706         { ADD_ACTIVE(state_offset + 1, 0); }
 707       break;
 708
 709       /*-----------------------------------------------------------------*/
 710       case OP_EOD:
 711       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 712       break;
 713
 714       /*-----------------------------------------------------------------*/
 715       case OP_OPT:
 716       ims = code[1];
 717       ADD_ACTIVE(state_offset + 2, 0);
 718       break;
 719
 720       /*-----------------------------------------------------------------*/
 721       case OP_SOD:
 722       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 723       break;
 724
 725       /*-----------------------------------------------------------------*/
 726       case OP_SOM:
 727       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 728       break;
 729
 730
 731 /* ========================================================================== */
 732       /* These opcodes inspect the next subject character, and sometimes
 733       the previous one as well, but do not have an argument. The variable
 734       clen contains the length of the current character and is zero if we are
 735       at the end of the subject. */
 736
 737       /*-----------------------------------------------------------------*/
 738       case OP_ANY:
 739       if (clen > 0 && !IS_NEWLINE(ptr))
 740         { ADD_NEW(state_offset + 1, 0); }
 741       break;
 742
 743       /*-----------------------------------------------------------------*/
 744       case OP_ALLANY:
 745       if (clen > 0)
 746         { ADD_NEW(state_offset + 1, 0); }
 747       break;
 748
 749       /*-----------------------------------------------------------------*/
 750       case OP_EODN:
 751       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 752         { ADD_ACTIVE(state_offset + 1, 0); }
 753       break;
 754
 755       /*-----------------------------------------------------------------*/
 756       case OP_DOLL:
 757       if ((md->moptions & PCRE_NOTEOL) == 0)
 758         {
 759         if (clen == 0 ||
 760             (IS_NEWLINE(ptr) &&
 761                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 762             ))
 763           { ADD_ACTIVE(state_offset + 1, 0); }
 764         }
 765       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 766         { ADD_ACTIVE(state_offset + 1, 0); }
 767       break;
 768
 769       /*-----------------------------------------------------------------*/
 770
 771       case OP_DIGIT:
 772       case OP_WHITESPACE:
 773       case OP_WORDCHAR:
 774       if (clen > 0 && c < 256 &&
 775             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 776         { ADD_NEW(state_offset + 1, 0); }
 777       break;
 778
 779       /*-----------------------------------------------------------------*/
 780       case OP_NOT_DIGIT:
 781       case OP_NOT_WHITESPACE:
 782       case OP_NOT_WORDCHAR:
 783       if (clen > 0 && (c >= 256 ||
 784             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 785         { ADD_NEW(state_offset + 1, 0); }
 786       break;
 787
 788       /*-----------------------------------------------------------------*/
 789       case OP_WORD_BOUNDARY:
 790       case OP_NOT_WORD_BOUNDARY:
 791         {
 792         int left_word, right_word;
 793
 794         if (ptr > start_subject)
 795           {
 796           const uschar *temp = ptr - 1;
 797 #ifdef SUPPORT_UTF8
 798           if (utf8) BACKCHAR(temp);
 799 #endif
 800           GETCHARTEST(d, temp);
 801           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 802           }
 803         else left_word = 0;
 804
 805         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 806           else right_word = 0;
 807
 808         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 809           { ADD_ACTIVE(state_offset + 1, 0); }
 810         }
 811       break;
 812
 813
 814       /*-----------------------------------------------------------------*/
 815       /* Check the next character by Unicode property. We will get here only
 816       if the support is in the binary; otherwise a compile-time error occurs.
 817       */
 818
 819 #ifdef SUPPORT_UCP
 820       case OP_PROP:
 821       case OP_NOTPROP:
 822       if (clen > 0)
 823         {
 824         BOOL OK;
 825         int chartype = UCD_CHARTYPE(c);
 826         switch(code[1])
 827           {
 828           case PT_ANY:
 829           OK = TRUE;
 830           break;
 831
 832           case PT_LAMP:
 833           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 834           break;
 835
 836           case PT_GC:
 837           OK = _pcre_ucp_gentype[chartype] == code[2];
 838           break;
 839
 840           case PT_PC:
 841           OK = chartype == code[2];
 842           break;
 843
 844           case PT_SC:
 845           OK = UCD_SCRIPT(c) == code[2];
 846           break;
 847
 848           /* Should never occur, but keep compilers from grumbling. */
 849
 850           default:
 851           OK = codevalue != OP_PROP;
 852           break;
 853           }
 854
 855         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 856         }
 857       break;
 858 #endif
 859
 860
 861
 862 /* ========================================================================== */
 863       /* These opcodes likewise inspect the subject character, but have an
 864       argument that is not a data character. It is one of these opcodes:
 865       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
 866       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 867
 868       case OP_TYPEPLUS:
 869       case OP_TYPEMINPLUS:
 870       case OP_TYPEPOSPLUS:
 871       count = current_state->count;  /* Already matched */
 872       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 873       if (clen > 0)
 874         {
 875         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 876             (c < 256 &&
 877               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 878               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 879           {
 880           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
 881             {
 882             active_count--;            /* Remove non-match possibility */
 883             next_active_state--;
 884             }
 885           count++;
 886           ADD_NEW(state_offset, count);
 887           }
 888         }
 889       break;
 890
 891       /*-----------------------------------------------------------------*/
 892       case OP_TYPEQUERY:
 893       case OP_TYPEMINQUERY:
 894       case OP_TYPEPOSQUERY:
 895       ADD_ACTIVE(state_offset + 2, 0);
 896       if (clen > 0)
 897         {
 898         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 899             (c < 256 &&
 900               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 901               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 902           {
 903           if (codevalue == OP_TYPEPOSQUERY)
 904             {
 905             active_count--;            /* Remove non-match possibility */
 906             next_active_state--;
 907             }
 908           ADD_NEW(state_offset + 2, 0);
 909           }
 910         }
 911       break;
 912
 913       /*-----------------------------------------------------------------*/
 914       case OP_TYPESTAR:
 915       case OP_TYPEMINSTAR:
 916       case OP_TYPEPOSSTAR:
 917       ADD_ACTIVE(state_offset + 2, 0);
 918       if (clen > 0)
 919         {
 920         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 921             (c < 256 &&
 922               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 923               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 924           {
 925           if (codevalue == OP_TYPEPOSSTAR)
 926             {
 927             active_count--;            /* Remove non-match possibility */
 928             next_active_state--;
 929             }
 930           ADD_NEW(state_offset, 0);
 931           }
 932         }
 933       break;
 934
 935       /*-----------------------------------------------------------------*/
 936       case OP_TYPEEXACT:
 937       count = current_state->count;  /* Number already matched */
 938       if (clen > 0)
 939         {
 940         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 941             (c < 256 &&
 942               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 943               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 944           {
 945           if (++count >= GET2(code, 1))
 946             { ADD_NEW(state_offset + 4, 0); }
 947           else
 948             { ADD_NEW(state_offset, count); }
 949           }
 950         }
 951       break;
 952
 953       /*-----------------------------------------------------------------*/
 954       case OP_TYPEUPTO:
 955       case OP_TYPEMINUPTO:
 956       case OP_TYPEPOSUPTO:
 957       ADD_ACTIVE(state_offset + 4, 0);
 958       count = current_state->count;  /* Number already matched */
 959       if (clen > 0)
 960         {
 961         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 962             (c < 256 &&
 963               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
 964               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
 965           {
 966           if (codevalue == OP_TYPEPOSUPTO)
 967             {
 968             active_count--;           /* Remove non-match possibility */
 969             next_active_state--;
 970             }
 971           if (++count >= GET2(code, 1))
 972             { ADD_NEW(state_offset + 4, 0); }
 973           else
 974             { ADD_NEW(state_offset, count); }
 975           }
 976         }
 977       break;
 978
 979 /* ========================================================================== */
 980       /* These are virtual opcodes that are used when something like
 981       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical
 982       or horizontal whitespace type as its argument. This keeps the code above
 983       fast for the other cases. The argument is in the d variable. */
 984
 985 #ifdef SUPPORT_UCP
 986       case OP_PROP_EXTRA + OP_TYPEPLUS:
 987       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
 988       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
 989       count = current_state->count;           /* Already matched */
 990       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
 991       if (clen > 0)
 992         {
 993         BOOL OK;
 994         int chartype = UCD_CHARTYPE(c);
 995         switch(code[2])
 996           {
 997           case PT_ANY:
 998           OK = TRUE;
 999           break;
1000
1001           case PT_LAMP:
1002           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1003           break;
1004
1005           case PT_GC:
1006           OK = _pcre_ucp_gentype[chartype] == code[3];
1007           break;
1008
1009           case PT_PC:
1010           OK = chartype == code[3];
1011           break;
1012
1013           case PT_SC:
1014           OK = UCD_SCRIPT(c) == code[3];
1015           break;
1016
1017           /* Should never occur, but keep compilers from grumbling. */
1018
1019           default:
1020           OK = codevalue != OP_PROP;
1021           break;
1022           }
1023
1024         if (OK == (d == OP_PROP))
1025           {
1026           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027             {
1028             active_count--;           /* Remove non-match possibility */
1029             next_active_state--;
1030             }
1031           count++;
1032           ADD_NEW(state_offset, count);
1033           }
1034         }
1035       break;
1036
1037       /*-----------------------------------------------------------------*/
1038       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1039       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1040       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041       count = current_state->count;  /* Already matched */
1042       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044         {
1045         const uschar *nptr = ptr + clen;
1046         int ncount = 0;
1047         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048           {
1049           active_count--;           /* Remove non-match possibility */
1050           next_active_state--;
1051           }
1052         while (nptr < end_subject)
1053           {
1054           int nd;
1055           int ndlen = 1;
1056           GETCHARLEN(nd, nptr, ndlen);
1057           if (UCD_CATEGORY(nd) != ucp_M) break;
1058           ncount++;
1059           nptr += ndlen;
1060           }
1061         count++;
1062         ADD_NEW_DATA(-state_offset, count, ncount);
1063         }
1064       break;
1065 #endif
1066
1067       /*-----------------------------------------------------------------*/
1068       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1069       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1070       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1071       count = current_state->count;  /* Already matched */
1072       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1073       if (clen > 0)
1074         {
1075         int ncount = 0;
1076         switch (c)
1077           {
1078           case 0x000b:
1079           case 0x000c:
1080           case 0x0085:
1081           case 0x2028:
1082           case 0x2029:
1083           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084           goto ANYNL01;
1085
1086           case 0x000d:
1087           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088           /* Fall through */
1089
1090           ANYNL01:
1091           case 0x000a:
1092           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093             {
1094             active_count--;           /* Remove non-match possibility */
1095             next_active_state--;
1096             }
1097           count++;
1098           ADD_NEW_DATA(-state_offset, count, ncount);
1099           break;
1100
1101           default:
1102           break;
1103           }
1104         }
1105       break;
1106
1107       /*-----------------------------------------------------------------*/
1108       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111       count = current_state->count;  /* Already matched */
1112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113       if (clen > 0)
1114         {
1115         BOOL OK;
1116         switch (c)
1117           {
1118           case 0x000a:
1119           case 0x000b:
1120           case 0x000c:
1121           case 0x000d:
1122           case 0x0085:
1123           case 0x2028:
1124           case 0x2029:
1125           OK = TRUE;
1126           break;
1127
1128           default:
1129           OK = FALSE;
1130           break;
1131           }
1132
1133         if (OK == (d == OP_VSPACE))
1134           {
1135           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136             {
1137             active_count--;           /* Remove non-match possibility */
1138             next_active_state--;
1139             }
1140           count++;
1141           ADD_NEW_DATA(-state_offset, count, 0);
1142           }
1143         }
1144       break;
1145
1146       /*-----------------------------------------------------------------*/
1147       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150       count = current_state->count;  /* Already matched */
1151       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152       if (clen > 0)
1153         {
1154         BOOL OK;
1155         switch (c)
1156           {
1157           case 0x09:      /* HT */
1158           case 0x20:      /* SPACE */
1159           case 0xa0:      /* NBSP */
1160           case 0x1680:    /* OGHAM SPACE MARK */
1161           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1162           case 0x2000:    /* EN QUAD */
1163           case 0x2001:    /* EM QUAD */
1164           case 0x2002:    /* EN SPACE */
1165           case 0x2003:    /* EM SPACE */
1166           case 0x2004:    /* THREE-PER-EM SPACE */
1167           case 0x2005:    /* FOUR-PER-EM SPACE */
1168           case 0x2006:    /* SIX-PER-EM SPACE */
1169           case 0x2007:    /* FIGURE SPACE */
1170           case 0x2008:    /* PUNCTUATION SPACE */
1171           case 0x2009:    /* THIN SPACE */
1172           case 0x200A:    /* HAIR SPACE */
1173           case 0x202f:    /* NARROW NO-BREAK SPACE */
1174           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1175           case 0x3000:    /* IDEOGRAPHIC SPACE */
1176           OK = TRUE;
1177           break;
1178
1179           default:
1180           OK = FALSE;
1181           break;
1182           }
1183
1184         if (OK == (d == OP_HSPACE))
1185           {
1186           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187             {
1188             active_count--;           /* Remove non-match possibility */
1189             next_active_state--;
1190             }
1191           count++;
1192           ADD_NEW_DATA(-state_offset, count, 0);
1193           }
1194         }
1195       break;
1196
1197       /*-----------------------------------------------------------------*/
1198 #ifdef SUPPORT_UCP
1199       case OP_PROP_EXTRA + OP_TYPEQUERY:
1200       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1201       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1202       count = 4;
1203       goto QS1;
1204
1205       case OP_PROP_EXTRA + OP_TYPESTAR:
1206       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1207       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1208       count = 0;
1209
1210       QS1:
1211
1212       ADD_ACTIVE(state_offset + 4, 0);
1213       if (clen > 0)
1214         {
1215         BOOL OK;
1216         int chartype = UCD_CHARTYPE(c);
1217         switch(code[2])
1218           {
1219           case PT_ANY:
1220           OK = TRUE;
1221           break;
1222
1223           case PT_LAMP:
1224           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1225           break;
1226
1227           case PT_GC:
1228           OK = _pcre_ucp_gentype[chartype] == code[3];
1229           break;
1230
1231           case PT_PC:
1232           OK = chartype == code[3];
1233           break;
1234
1235           case PT_SC:
1236           OK = UCD_SCRIPT(c) == code[3];
1237           break;
1238
1239           /* Should never occur, but keep compilers from grumbling. */
1240
1241           default:
1242           OK = codevalue != OP_PROP;
1243           break;
1244           }
1245
1246         if (OK == (d == OP_PROP))
1247           {
1248           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1249               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1250             {
1251             active_count--;           /* Remove non-match possibility */
1252             next_active_state--;
1253             }
1254           ADD_NEW(state_offset + count, 0);
1255           }
1256         }
1257       break;
1258
1259       /*-----------------------------------------------------------------*/
1260       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1261       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1262       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1263       count = 2;
1264       goto QS2;
1265
1266       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1267       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1268       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1269       count = 0;
1270
1271       QS2:
1272
1273       ADD_ACTIVE(state_offset + 2, 0);
1274       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275         {
1276         const uschar *nptr = ptr + clen;
1277         int ncount = 0;
1278         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1279             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1280           {
1281           active_count--;           /* Remove non-match possibility */
1282           next_active_state--;
1283           }
1284         while (nptr < end_subject)
1285           {
1286           int nd;
1287           int ndlen = 1;
1288           GETCHARLEN(nd, nptr, ndlen);
1289           if (UCD_CATEGORY(nd) != ucp_M) break;
1290           ncount++;
1291           nptr += ndlen;
1292           }
1293         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1294         }
1295       break;
1296 #endif
1297
1298       /*-----------------------------------------------------------------*/
1299       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1300       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1301       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1302       count = 2;
1303       goto QS3;
1304
1305       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1306       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1307       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1308       count = 0;
1309
1310       QS3:
1311       ADD_ACTIVE(state_offset + 2, 0);
1312       if (clen > 0)
1313         {
1314         int ncount = 0;
1315         switch (c)
1316           {
1317           case 0x000b:
1318           case 0x000c:
1319           case 0x0085:
1320           case 0x2028:
1321           case 0x2029:
1322           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323           goto ANYNL02;
1324
1325           case 0x000d:
1326           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327           /* Fall through */
1328
1329           ANYNL02:
1330           case 0x000a:
1331           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333             {
1334             active_count--;           /* Remove non-match possibility */
1335             next_active_state--;
1336             }
1337           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338           break;
1339
1340           default:
1341           break;
1342           }
1343         }
1344       break;
1345
1346       /*-----------------------------------------------------------------*/
1347       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350       count = 2;
1351       goto QS4;
1352
1353       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356       count = 0;
1357
1358       QS4:
1359       ADD_ACTIVE(state_offset + 2, 0);
1360       if (clen > 0)
1361         {
1362         BOOL OK;
1363         switch (c)
1364           {
1365           case 0x000a:
1366           case 0x000b:
1367           case 0x000c:
1368           case 0x000d:
1369           case 0x0085:
1370           case 0x2028:
1371           case 0x2029:
1372           OK = TRUE;
1373           break;
1374
1375           default:
1376           OK = FALSE;
1377           break;
1378           }
1379         if (OK == (d == OP_VSPACE))
1380           {
1381           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383             {
1384             active_count--;           /* Remove non-match possibility */
1385             next_active_state--;
1386             }
1387           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388           }
1389         }
1390       break;
1391
1392       /*-----------------------------------------------------------------*/
1393       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396       count = 2;
1397       goto QS5;
1398
1399       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402       count = 0;
1403
1404       QS5:
1405       ADD_ACTIVE(state_offset + 2, 0);
1406       if (clen > 0)
1407         {
1408         BOOL OK;
1409         switch (c)
1410           {
1411           case 0x09:      /* HT */
1412           case 0x20:      /* SPACE */
1413           case 0xa0:      /* NBSP */
1414           case 0x1680:    /* OGHAM SPACE MARK */
1415           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1416           case 0x2000:    /* EN QUAD */
1417           case 0x2001:    /* EM QUAD */
1418           case 0x2002:    /* EN SPACE */
1419           case 0x2003:    /* EM SPACE */
1420           case 0x2004:    /* THREE-PER-EM SPACE */
1421           case 0x2005:    /* FOUR-PER-EM SPACE */
1422           case 0x2006:    /* SIX-PER-EM SPACE */
1423           case 0x2007:    /* FIGURE SPACE */
1424           case 0x2008:    /* PUNCTUATION SPACE */
1425           case 0x2009:    /* THIN SPACE */
1426           case 0x200A:    /* HAIR SPACE */
1427           case 0x202f:    /* NARROW NO-BREAK SPACE */
1428           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1429           case 0x3000:    /* IDEOGRAPHIC SPACE */
1430           OK = TRUE;
1431           break;
1432
1433           default:
1434           OK = FALSE;
1435           break;
1436           }
1437
1438         if (OK == (d == OP_HSPACE))
1439           {
1440           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442             {
1443             active_count--;           /* Remove non-match possibility */
1444             next_active_state--;
1445             }
1446           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447           }
1448         }
1449       break;
1450
1451       /*-----------------------------------------------------------------*/
1452 #ifdef SUPPORT_UCP
1453       case OP_PROP_EXTRA + OP_TYPEEXACT:
1454       case OP_PROP_EXTRA + OP_TYPEUPTO:
1455       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1456       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1457       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1458         { ADD_ACTIVE(state_offset + 6, 0); }
1459       count = current_state->count;  /* Number already matched */
1460       if (clen > 0)
1461         {
1462         BOOL OK;
1463         int chartype = UCD_CHARTYPE(c);
1464         switch(code[4])
1465           {
1466           case PT_ANY:
1467           OK = TRUE;
1468           break;
1469
1470           case PT_LAMP:
1471           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472           break;
1473
1474           case PT_GC:
1475           OK = _pcre_ucp_gentype[chartype] == code[5];
1476           break;
1477
1478           case PT_PC:
1479           OK = chartype == code[5];
1480           break;
1481
1482           case PT_SC:
1483           OK = UCD_SCRIPT(c) == code[5];
1484           break;
1485
1486           /* Should never occur, but keep compilers from grumbling. */
1487
1488           default:
1489           OK = codevalue != OP_PROP;
1490           break;
1491           }
1492
1493         if (OK == (d == OP_PROP))
1494           {
1495           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1496             {
1497             active_count--;           /* Remove non-match possibility */
1498             next_active_state--;
1499             }
1500           if (++count >= GET2(code, 1))
1501             { ADD_NEW(state_offset + 6, 0); }
1502           else
1503             { ADD_NEW(state_offset, count); }
1504           }
1505         }
1506       break;
1507
1508       /*-----------------------------------------------------------------*/
1509       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1510       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1511       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1512       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1513       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514         { ADD_ACTIVE(state_offset + 4, 0); }
1515       count = current_state->count;  /* Number already matched */
1516       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517         {
1518         const uschar *nptr = ptr + clen;
1519         int ncount = 0;
1520         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1521           {
1522           active_count--;           /* Remove non-match possibility */
1523           next_active_state--;
1524           }
1525         while (nptr < end_subject)
1526           {
1527           int nd;
1528           int ndlen = 1;
1529           GETCHARLEN(nd, nptr, ndlen);
1530           if (UCD_CATEGORY(nd) != ucp_M) break;
1531           ncount++;
1532           nptr += ndlen;
1533           }
1534         if (++count >= GET2(code, 1))
1535           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1536         else
1537           { ADD_NEW_DATA(-state_offset, count, ncount); }
1538         }
1539       break;
1540 #endif
1541
1542       /*-----------------------------------------------------------------*/
1543       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1544       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1545       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1546       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1547       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1548         { ADD_ACTIVE(state_offset + 4, 0); }
1549       count = current_state->count;  /* Number already matched */
1550       if (clen > 0)
1551         {
1552         int ncount = 0;
1553         switch (c)
1554           {
1555           case 0x000b:
1556           case 0x000c:
1557           case 0x0085:
1558           case 0x2028:
1559           case 0x2029:
1560           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561           goto ANYNL03;
1562
1563           case 0x000d:
1564           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565           /* Fall through */
1566
1567           ANYNL03:
1568           case 0x000a:
1569           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570             {
1571             active_count--;           /* Remove non-match possibility */
1572             next_active_state--;
1573             }
1574           if (++count >= GET2(code, 1))
1575             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576           else
1577             { ADD_NEW_DATA(-state_offset, count, ncount); }
1578           break;
1579
1580           default:
1581           break;
1582           }
1583         }
1584       break;
1585
1586       /*-----------------------------------------------------------------*/
1587       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592         { ADD_ACTIVE(state_offset + 4, 0); }
1593       count = current_state->count;  /* Number already matched */
1594       if (clen > 0)
1595         {
1596         BOOL OK;
1597         switch (c)
1598           {
1599           case 0x000a:
1600           case 0x000b:
1601           case 0x000c:
1602           case 0x000d:
1603           case 0x0085:
1604           case 0x2028:
1605           case 0x2029:
1606           OK = TRUE;
1607           break;
1608
1609           default:
1610           OK = FALSE;
1611           }
1612
1613         if (OK == (d == OP_VSPACE))
1614           {
1615           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616             {
1617             active_count--;           /* Remove non-match possibility */
1618             next_active_state--;
1619             }
1620           if (++count >= GET2(code, 1))
1621             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622           else
1623             { ADD_NEW_DATA(-state_offset, count, 0); }
1624           }
1625         }
1626       break;
1627
1628       /*-----------------------------------------------------------------*/
1629       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634         { ADD_ACTIVE(state_offset + 4, 0); }
1635       count = current_state->count;  /* Number already matched */
1636       if (clen > 0)
1637         {
1638         BOOL OK;
1639         switch (c)
1640           {
1641           case 0x09:      /* HT */
1642           case 0x20:      /* SPACE */
1643           case 0xa0:      /* NBSP */
1644           case 0x1680:    /* OGHAM SPACE MARK */
1645           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1646           case 0x2000:    /* EN QUAD */
1647           case 0x2001:    /* EM QUAD */
1648           case 0x2002:    /* EN SPACE */
1649           case 0x2003:    /* EM SPACE */
1650           case 0x2004:    /* THREE-PER-EM SPACE */
1651           case 0x2005:    /* FOUR-PER-EM SPACE */
1652           case 0x2006:    /* SIX-PER-EM SPACE */
1653           case 0x2007:    /* FIGURE SPACE */
1654           case 0x2008:    /* PUNCTUATION SPACE */
1655           case 0x2009:    /* THIN SPACE */
1656           case 0x200A:    /* HAIR SPACE */
1657           case 0x202f:    /* NARROW NO-BREAK SPACE */
1658           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1659           case 0x3000:    /* IDEOGRAPHIC SPACE */
1660           OK = TRUE;
1661           break;
1662
1663           default:
1664           OK = FALSE;
1665           break;
1666           }
1667
1668         if (OK == (d == OP_HSPACE))
1669           {
1670           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671             {
1672             active_count--;           /* Remove non-match possibility */
1673             next_active_state--;
1674             }
1675           if (++count >= GET2(code, 1))
1676             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677           else
1678             { ADD_NEW_DATA(-state_offset, count, 0); }
1679           }
1680         }
1681       break;
1682
1683 /* ========================================================================== */
1684       /* These opcodes are followed by a character that is usually compared
1685       to the current subject character; it is loaded into d. We still get
1686       here even if there is no subject character, because in some cases zero
1687       repetitions are permitted. */
1688
1689       /*-----------------------------------------------------------------*/
1690       case OP_CHAR:
1691       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1692       break;
1693
1694       /*-----------------------------------------------------------------*/
1695       case OP_CHARNC:
1696       if (clen == 0) break;
1697
1698 #ifdef SUPPORT_UTF8
1699       if (utf8)
1700         {
1701         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1702           {
1703           unsigned int othercase;
1704           if (c < 128) othercase = fcc[c]; else
1705
1706           /* If we have Unicode property support, we can use it to test the
1707           other case of the character. */
1708
1709 #ifdef SUPPORT_UCP
1710           othercase = UCD_OTHERCASE(c);
1711 #else
1712           othercase = NOTACHAR;
1713 #endif
1714
1715           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1716           }
1717         }
1718       else
1719 #endif  /* SUPPORT_UTF8 */
1720
1721       /* Non-UTF-8 mode */
1722         {
1723         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1724         }
1725       break;
1726
1727
1728 #ifdef SUPPORT_UCP
1729       /*-----------------------------------------------------------------*/
1730       /* This is a tricky one because it can match more than one character.
1731       Find out how many characters to skip, and then set up a negative state
1732       to wait for them to pass before continuing. */
1733
1734       case OP_EXTUNI:
1735       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736         {
1737         const uschar *nptr = ptr + clen;
1738         int ncount = 0;
1739         while (nptr < end_subject)
1740           {
1741           int nclen = 1;
1742           GETCHARLEN(c, nptr, nclen);
1743           if (UCD_CATEGORY(c) != ucp_M) break;
1744           ncount++;
1745           nptr += nclen;
1746           }
1747         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1748         }
1749       break;
1750 #endif
1751
1752       /*-----------------------------------------------------------------*/
1753       /* This is tricky like EXTUNI because it too can match more than one
1754       character (when CR is followed by LF). In this case, set up a negative
1755       state to wait for one character to pass before continuing. */
1756
1757       case OP_ANYNL:
1758       if (clen > 0) switch(c)
1759         {
1760         case 0x000b:
1761         case 0x000c:
1762         case 0x0085:
1763         case 0x2028:
1764         case 0x2029:
1765         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766
1767         case 0x000a:
1768         ADD_NEW(state_offset + 1, 0);
1769         break;
1770
1771         case 0x000d:
1772         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773           {
1774           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1775           }
1776         else
1777           {
1778           ADD_NEW(state_offset + 1, 0);
1779           }
1780         break;
1781         }
1782       break;
1783
1784       /*-----------------------------------------------------------------*/
1785       case OP_NOT_VSPACE:
1786       if (clen > 0) switch(c)
1787         {
1788         case 0x000a:
1789         case 0x000b:
1790         case 0x000c:
1791         case 0x000d:
1792         case 0x0085:
1793         case 0x2028:
1794         case 0x2029:
1795         break;
1796
1797         default:
1798         ADD_NEW(state_offset + 1, 0);
1799         break;
1800         }
1801       break;
1802
1803       /*-----------------------------------------------------------------*/
1804       case OP_VSPACE:
1805       if (clen > 0) switch(c)
1806         {
1807         case 0x000a:
1808         case 0x000b:
1809         case 0x000c:
1810         case 0x000d:
1811         case 0x0085:
1812         case 0x2028:
1813         case 0x2029:
1814         ADD_NEW(state_offset + 1, 0);
1815         break;
1816
1817         default: break;
1818         }
1819       break;
1820
1821       /*-----------------------------------------------------------------*/
1822       case OP_NOT_HSPACE:
1823       if (clen > 0) switch(c)
1824         {
1825         case 0x09:      /* HT */
1826         case 0x20:      /* SPACE */
1827         case 0xa0:      /* NBSP */
1828         case 0x1680:    /* OGHAM SPACE MARK */
1829         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1830         case 0x2000:    /* EN QUAD */
1831         case 0x2001:    /* EM QUAD */
1832         case 0x2002:    /* EN SPACE */
1833         case 0x2003:    /* EM SPACE */
1834         case 0x2004:    /* THREE-PER-EM SPACE */
1835         case 0x2005:    /* FOUR-PER-EM SPACE */
1836         case 0x2006:    /* SIX-PER-EM SPACE */
1837         case 0x2007:    /* FIGURE SPACE */
1838         case 0x2008:    /* PUNCTUATION SPACE */
1839         case 0x2009:    /* THIN SPACE */
1840         case 0x200A:    /* HAIR SPACE */
1841         case 0x202f:    /* NARROW NO-BREAK SPACE */
1842         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1843         case 0x3000:    /* IDEOGRAPHIC SPACE */
1844         break;
1845
1846         default:
1847         ADD_NEW(state_offset + 1, 0);
1848         break;
1849         }
1850       break;
1851
1852       /*-----------------------------------------------------------------*/
1853       case OP_HSPACE:
1854       if (clen > 0) switch(c)
1855         {
1856         case 0x09:      /* HT */
1857         case 0x20:      /* SPACE */
1858         case 0xa0:      /* NBSP */
1859         case 0x1680:    /* OGHAM SPACE MARK */
1860         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1861         case 0x2000:    /* EN QUAD */
1862         case 0x2001:    /* EM QUAD */
1863         case 0x2002:    /* EN SPACE */
1864         case 0x2003:    /* EM SPACE */
1865         case 0x2004:    /* THREE-PER-EM SPACE */
1866         case 0x2005:    /* FOUR-PER-EM SPACE */
1867         case 0x2006:    /* SIX-PER-EM SPACE */
1868         case 0x2007:    /* FIGURE SPACE */
1869         case 0x2008:    /* PUNCTUATION SPACE */
1870         case 0x2009:    /* THIN SPACE */
1871         case 0x200A:    /* HAIR SPACE */
1872         case 0x202f:    /* NARROW NO-BREAK SPACE */
1873         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1874         case 0x3000:    /* IDEOGRAPHIC SPACE */
1875         ADD_NEW(state_offset + 1, 0);
1876         break;
1877         }
1878       break;
1879
1880       /*-----------------------------------------------------------------*/
1881       /* Match a negated single character. This is only used for one-byte
1882       characters, that is, we know that d < 256. The character we are
1883       checking (c) can be multibyte. */
1884
1885       case OP_NOT:
1886       if (clen > 0)
1887         {
1888         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1889         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1890         }
1891       break;
1892
1893       /*-----------------------------------------------------------------*/
1894       case OP_PLUS:
1895       case OP_MINPLUS:
1896       case OP_POSPLUS:
1897       case OP_NOTPLUS:
1898       case OP_NOTMINPLUS:
1899       case OP_NOTPOSPLUS:
1900       count = current_state->count;  /* Already matched */
1901       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1902       if (clen > 0)
1903         {
1904         unsigned int otherd = NOTACHAR;
1905         if ((ims & PCRE_CASELESS) != 0)
1906           {
1907 #ifdef SUPPORT_UTF8
1908           if (utf8 && d >= 128)
1909             {
1910 #ifdef SUPPORT_UCP
1911             otherd = UCD_OTHERCASE(d);
1912 #endif  /* SUPPORT_UCP */
1913             }
1914           else
1915 #endif  /* SUPPORT_UTF8 */
1916           otherd = fcc[d];
1917           }
1918         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1919           {
1920           if (count > 0 &&
1921               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1922             {
1923             active_count--;             /* Remove non-match possibility */
1924             next_active_state--;
1925             }
1926           count++;
1927           ADD_NEW(state_offset, count);
1928           }
1929         }
1930       break;
1931
1932       /*-----------------------------------------------------------------*/
1933       case OP_QUERY:
1934       case OP_MINQUERY:
1935       case OP_POSQUERY:
1936       case OP_NOTQUERY:
1937       case OP_NOTMINQUERY:
1938       case OP_NOTPOSQUERY:
1939       ADD_ACTIVE(state_offset + dlen + 1, 0);
1940       if (clen > 0)
1941         {
1942         unsigned int otherd = NOTACHAR;
1943         if ((ims & PCRE_CASELESS) != 0)
1944           {
1945 #ifdef SUPPORT_UTF8
1946           if (utf8 && d >= 128)
1947             {
1948 #ifdef SUPPORT_UCP
1949             otherd = UCD_OTHERCASE(d);
1950 #endif  /* SUPPORT_UCP */
1951             }
1952           else
1953 #endif  /* SUPPORT_UTF8 */
1954           otherd = fcc[d];
1955           }
1956         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1957           {
1958           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1959             {
1960             active_count--;            /* Remove non-match possibility */
1961             next_active_state--;
1962             }
1963           ADD_NEW(state_offset + dlen + 1, 0);
1964           }
1965         }
1966       break;
1967
1968       /*-----------------------------------------------------------------*/
1969       case OP_STAR:
1970       case OP_MINSTAR:
1971       case OP_POSSTAR:
1972       case OP_NOTSTAR:
1973       case OP_NOTMINSTAR:
1974       case OP_NOTPOSSTAR:
1975       ADD_ACTIVE(state_offset + dlen + 1, 0);
1976       if (clen > 0)
1977         {
1978         unsigned int otherd = NOTACHAR;
1979         if ((ims & PCRE_CASELESS) != 0)
1980           {
1981 #ifdef SUPPORT_UTF8
1982           if (utf8 && d >= 128)
1983             {
1984 #ifdef SUPPORT_UCP
1985             otherd = UCD_OTHERCASE(d);
1986 #endif  /* SUPPORT_UCP */
1987             }
1988           else
1989 #endif  /* SUPPORT_UTF8 */
1990           otherd = fcc[d];
1991           }
1992         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1993           {
1994           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1995             {
1996             active_count--;            /* Remove non-match possibility */
1997             next_active_state--;
1998             }
1999           ADD_NEW(state_offset, 0);
2000           }
2001         }
2002       break;
2003
2004       /*-----------------------------------------------------------------*/
2005       case OP_EXACT:
2006       case OP_NOTEXACT:
2007       count = current_state->count;  /* Number already matched */
2008       if (clen > 0)
2009         {
2010         unsigned int otherd = NOTACHAR;
2011         if ((ims & PCRE_CASELESS) != 0)
2012           {
2013 #ifdef SUPPORT_UTF8
2014           if (utf8 && d >= 128)
2015             {
2016 #ifdef SUPPORT_UCP
2017             otherd = UCD_OTHERCASE(d);
2018 #endif  /* SUPPORT_UCP */
2019             }
2020           else
2021 #endif  /* SUPPORT_UTF8 */
2022           otherd = fcc[d];
2023           }
2024         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2025           {
2026           if (++count >= GET2(code, 1))
2027             { ADD_NEW(state_offset + dlen + 3, 0); }
2028           else
2029             { ADD_NEW(state_offset, count); }
2030           }
2031         }
2032       break;
2033
2034       /*-----------------------------------------------------------------*/
2035       case OP_UPTO:
2036       case OP_MINUPTO:
2037       case OP_POSUPTO:
2038       case OP_NOTUPTO:
2039       case OP_NOTMINUPTO:
2040       case OP_NOTPOSUPTO:
2041       ADD_ACTIVE(state_offset + dlen + 3, 0);
2042       count = current_state->count;  /* Number already matched */
2043       if (clen > 0)
2044         {
2045         unsigned int otherd = NOTACHAR;
2046         if ((ims & PCRE_CASELESS) != 0)
2047           {
2048 #ifdef SUPPORT_UTF8
2049           if (utf8 && d >= 128)
2050             {
2051 #ifdef SUPPORT_UCP
2052             otherd = UCD_OTHERCASE(d);
2053 #endif  /* SUPPORT_UCP */
2054             }
2055           else
2056 #endif  /* SUPPORT_UTF8 */
2057           otherd = fcc[d];
2058           }
2059         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2060           {
2061           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2062             {
2063             active_count--;             /* Remove non-match possibility */
2064             next_active_state--;
2065             }
2066           if (++count >= GET2(code, 1))
2067             { ADD_NEW(state_offset + dlen + 3, 0); }
2068           else
2069             { ADD_NEW(state_offset, count); }
2070           }
2071         }
2072       break;
2073
2074
2075 /* ========================================================================== */
2076       /* These are the class-handling opcodes */
2077
2078       case OP_CLASS:
2079       case OP_NCLASS:
2080       case OP_XCLASS:
2081         {
2082         BOOL isinclass = FALSE;
2083         int next_state_offset;
2084         const uschar *ecode;
2085
2086         /* For a simple class, there is always just a 32-byte table, and we
2087         can set isinclass from it. */
2088
2089         if (codevalue != OP_XCLASS)
2090           {
2091           ecode = code + 33;
2092           if (clen > 0)
2093             {
2094             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2095               ((code[1 + c/8] & (1 << (c&7))) != 0);
2096             }
2097           }
2098
2099         /* An extended class may have a table or a list of single characters,
2100         ranges, or both, and it may be positive or negative. There's a
2101         function that sorts all this out. */
2102
2103         else
2104          {
2105          ecode = code + GET(code, 1);
2106          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2107          }
2108
2109         /* At this point, isinclass is set for all kinds of class, and ecode
2110         points to the byte after the end of the class. If there is a
2111         quantifier, this is where it will be. */
2112
2113         next_state_offset = ecode - start_code;
2114
2115         switch (*ecode)
2116           {
2117           case OP_CRSTAR:
2118           case OP_CRMINSTAR:
2119           ADD_ACTIVE(next_state_offset + 1, 0);
2120           if (isinclass) { ADD_NEW(state_offset, 0); }
2121           break;
2122
2123           case OP_CRPLUS:
2124           case OP_CRMINPLUS:
2125           count = current_state->count;  /* Already matched */
2126           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2127           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2128           break;
2129
2130           case OP_CRQUERY:
2131           case OP_CRMINQUERY:
2132           ADD_ACTIVE(next_state_offset + 1, 0);
2133           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2134           break;
2135
2136           case OP_CRRANGE:
2137           case OP_CRMINRANGE:
2138           count = current_state->count;  /* Already matched */
2139           if (count >= GET2(ecode, 1))
2140             { ADD_ACTIVE(next_state_offset + 5, 0); }
2141           if (isinclass)
2142             {
2143             int max = GET2(ecode, 3);
2144             if (++count >= max && max != 0)   /* Max 0 => no limit */
2145               { ADD_NEW(next_state_offset + 5, 0); }
2146             else
2147               { ADD_NEW(state_offset, count); }
2148             }
2149           break;
2150
2151           default:
2152           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2153           break;
2154           }
2155         }
2156       break;
2157
2158 /* ========================================================================== */
2159       /* These are the opcodes for fancy brackets of various kinds. We have
2160       to use recursion in order to handle them. The "always failing" assertion
2161       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162       though the other "backtracking verbs" are not supported. */
2163
2164       case OP_FAIL:
2165       break;
2166
2167       case OP_ASSERT:
2168       case OP_ASSERT_NOT:
2169       case OP_ASSERTBACK:
2170       case OP_ASSERTBACK_NOT:
2171         {
2172         int rc;
2173         int local_offsets[2];
2174         int local_workspace[1000];
2175         const uschar *endasscode = code + GET(code, 1);
2176
2177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178
2179         rc = internal_dfa_exec(
2180           md,                                   /* static match data */
2181           code,                                 /* this subexpression's code */
2182           ptr,                                  /* where we currently are */
2183           ptr - start_subject,                  /* start offset */
2184           local_offsets,                        /* offset vector */
2185           sizeof(local_offsets)/sizeof(int),    /* size of same */
2186           local_workspace,                      /* workspace vector */
2187           sizeof(local_workspace)/sizeof(int),  /* size of same */
2188           ims,                                  /* the current ims flags */
2189           rlevel,                               /* function recursion level */
2190           recursing);                           /* pass on regex recursion */
2191
2192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194         }
2195       break;
2196
2197       /*-----------------------------------------------------------------*/
2198       case OP_COND:
2199       case OP_SCOND:
2200         {
2201         int local_offsets[1000];
2202         int local_workspace[1000];
2203         int condcode = code[LINK_SIZE+1];
2204
2205         /* Back reference conditions are not supported */
2206
2207         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2208
2209         /* The DEFINE condition is always false */
2210
2211         if (condcode == OP_DEF)
2212           {
2213           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2214           }
2215
2216         /* The only supported version of OP_RREF is for the value RREF_ANY,
2217         which means "test if in any recursion". We can't test for specifically
2218         recursed groups. */
2219
2220         else if (condcode == OP_RREF)
2221           {
2222           int value = GET2(code, LINK_SIZE+2);
2223           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2224           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2225             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2226           }
2227
2228         /* Otherwise, the condition is an assertion */
2229
2230         else
2231           {
2232           int rc;
2233           const uschar *asscode = code + LINK_SIZE + 1;
2234           const uschar *endasscode = asscode + GET(asscode, 1);
2235
2236           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2237
2238           rc = internal_dfa_exec(
2239             md,                                   /* fixed match data */
2240             asscode,                              /* this subexpression's code */
2241             ptr,                                  /* where we currently are */
2242             ptr - start_subject,                  /* start offset */
2243             local_offsets,                        /* offset vector */
2244             sizeof(local_offsets)/sizeof(int),    /* size of same */
2245             local_workspace,                      /* workspace vector */
2246             sizeof(local_workspace)/sizeof(int),  /* size of same */
2247             ims,                                  /* the current ims flags */
2248             rlevel,                               /* function recursion level */
2249             recursing);                           /* pass on regex recursion */
2250
2251           if ((rc >= 0) ==
2252                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2253             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2254           else
2255             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2256           }
2257         }
2258       break;
2259
2260       /*-----------------------------------------------------------------*/
2261       case OP_RECURSE:
2262         {
2263         int local_offsets[1000];
2264         int local_workspace[1000];
2265         int rc;
2266
2267         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2268           recursing + 1));
2269
2270         rc = internal_dfa_exec(
2271           md,                                   /* fixed match data */
2272           start_code + GET(code, 1),            /* this subexpression's code */
2273           ptr,                                  /* where we currently are */
2274           ptr - start_subject,                  /* start offset */
2275           local_offsets,                        /* offset vector */
2276           sizeof(local_offsets)/sizeof(int),    /* size of same */
2277           local_workspace,                      /* workspace vector */
2278           sizeof(local_workspace)/sizeof(int),  /* size of same */
2279           ims,                                  /* the current ims flags */
2280           rlevel,                               /* function recursion level */
2281           recursing + 1);                       /* regex recurse level */
2282
2283         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2284           recursing + 1, rc));
2285
2286         /* Ran out of internal offsets */
2287
2288         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2289
2290         /* For each successful matched substring, set up the next state with a
2291         count of characters to skip before trying it. Note that the count is in
2292         characters, not bytes. */
2293
2294         if (rc > 0)
2295           {
2296           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2297             {
2298             const uschar *p = start_subject + local_offsets[rc];
2299             const uschar *pp = start_subject + local_offsets[rc+1];
2300             int charcount = local_offsets[rc+1] - local_offsets[rc];
2301             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2302             if (charcount > 0)
2303               {
2304               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2305               }
2306             else
2307               {
2308               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2309               }
2310             }
2311           }
2312         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2313         }
2314       break;
2315
2316       /*-----------------------------------------------------------------*/
2317       case OP_ONCE:
2318         {
2319         int local_offsets[2];
2320         int local_workspace[1000];
2321
2322         int rc = internal_dfa_exec(
2323           md,                                   /* fixed match data */
2324           code,                                 /* this subexpression's code */
2325           ptr,                                  /* where we currently are */
2326           ptr - start_subject,                  /* start offset */
2327           local_offsets,                        /* offset vector */
2328           sizeof(local_offsets)/sizeof(int),    /* size of same */
2329           local_workspace,                      /* workspace vector */
2330           sizeof(local_workspace)/sizeof(int),  /* size of same */
2331           ims,                                  /* the current ims flags */
2332           rlevel,                               /* function recursion level */
2333           recursing);                           /* pass on regex recursion */
2334
2335         if (rc >= 0)
2336           {
2337           const uschar *end_subpattern = code;
2338           int charcount = local_offsets[1] - local_offsets[0];
2339           int next_state_offset, repeat_state_offset;
2340
2341           do { end_subpattern += GET(end_subpattern, 1); }
2342             while (*end_subpattern == OP_ALT);
2343           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2344
2345           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2346           arrange for the repeat state also to be added to the relevant list.
2347           Calculate the offset, or set -1 for no repeat. */
2348
2349           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2350                                  *end_subpattern == OP_KETRMIN)?
2351             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2352
2353           /* If we have matched an empty string, add the next state at the
2354           current character pointer. This is important so that the duplicate
2355           checking kicks in, which is what breaks infinite loops that match an
2356           empty string. */
2357
2358           if (charcount == 0)
2359             {
2360             ADD_ACTIVE(next_state_offset, 0);
2361             }
2362
2363           /* Optimization: if there are no more active states, and there
2364           are no new states yet set up, then skip over the subject string
2365           right here, to save looping. Otherwise, set up the new state to swing
2366           into action when the end of the substring is reached. */
2367
2368           else if (i + 1 >= active_count && new_count == 0)
2369             {
2370             ptr += charcount;
2371             clen = 0;
2372             ADD_NEW(next_state_offset, 0);
2373
2374             /* If we are adding a repeat state at the new character position,
2375             we must fudge things so that it is the only current state.
2376             Otherwise, it might be a duplicate of one we processed before, and
2377             that would cause it to be skipped. */
2378
2379             if (repeat_state_offset >= 0)
2380               {
2381               next_active_state = active_states;
2382               active_count = 0;
2383               i = -1;
2384               ADD_ACTIVE(repeat_state_offset, 0);
2385               }
2386             }
2387           else
2388             {
2389             const uschar *p = start_subject + local_offsets[0];
2390             const uschar *pp = start_subject + local_offsets[1];
2391             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2392             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2393             if (repeat_state_offset >= 0)
2394               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2395             }
2396
2397           }
2398         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2399         }
2400       break;
2401
2402
2403 /* ========================================================================== */
2404       /* Handle callouts */
2405
2406       case OP_CALLOUT:
2407       if (pcre_callout != NULL)
2408         {
2409         int rrc;
2410         pcre_callout_block cb;
2411         cb.version          = 1;   /* Version 1 of the callout block */
2412         cb.callout_number   = code[1];
2413         cb.offset_vector    = offsets;
2414         cb.subject          = (PCRE_SPTR)start_subject;
2415         cb.subject_length   = end_subject - start_subject;
2416         cb.start_match      = current_subject - start_subject;
2417         cb.current_position = ptr - start_subject;
2418         cb.pattern_position = GET(code, 2);
2419         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2420         cb.capture_top      = 1;
2421         cb.capture_last     = -1;
2422         cb.callout_data     = md->callout_data;
2423         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2424         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2425         }
2426       break;
2427
2428
2429 /* ========================================================================== */
2430       default:        /* Unsupported opcode */
2431       return PCRE_ERROR_DFA_UITEM;
2432       }
2433
2434     NEXT_ACTIVE_STATE: continue;
2435
2436     }      /* End of loop scanning active states */
2437
2438   /* We have finished the processing at the current subject character. If no
2439   new states have been set for the next character, we have found all the
2440   matches that we are going to find. If we are at the top level and partial
2441   matching has been requested, check for appropriate conditions. */
2442
2443   if (new_count <= 0)
2444     {
2445     if (match_count < 0 &&                     /* No matches found */
2446         rlevel == 1 &&                         /* Top level match function */
2447         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2448         ptr >= end_subject &&                  /* Reached end of subject */
2449         ptr > current_subject)                 /* Matched non-empty string */
2450       {
2451       if (offsetcount >= 2)
2452         {
2453         offsets[0] = current_subject - start_subject;
2454         offsets[1] = end_subject - start_subject;
2455         }
2456       match_count = PCRE_ERROR_PARTIAL;
2457       }
2458
2459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2460       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2461       rlevel*2-2, SP));
2462     break;        /* In effect, "return", but see the comment below */
2463     }
2464
2465   /* One or more states are active for the next character. */
2466
2467   ptr += clen;    /* Advance to next subject character */
2468   }               /* Loop to move along the subject string */
2469
2470 /* Control gets here from "break" a few lines above. We do it this way because
2471 if we use "return" above, we have compiler trouble. Some compilers warn if
2472 there's nothing here because they think the function doesn't return a value. On
2473 the other hand, if we put a dummy statement here, some more clever compilers
2474 complain that it can't be reached. Sigh. */
2475
2476 return match_count;
2477 }
2478
2479
2480
2481
2482 /*************************************************
2483 *    Execute a Regular Expression - DFA engine   *
2484 *************************************************/
2485
2486 /* This external function applies a compiled re to a subject string using a DFA
2487 engine. This function calls the internal function multiple times if the pattern
2488 is not anchored.
2489
2490 Arguments:
2491   argument_re     points to the compiled expression
2492   extra_data      points to extra data or is NULL
2493   subject         points to the subject string
2494   length          length of subject string (may contain binary zeros)
2495   start_offset    where to start in the subject string
2496   options         option bits
2497   offsets         vector of match offsets
2498   offsetcount     size of same
2499   workspace       workspace vector
2500   wscount         size of same
2501
2502 Returns:          > 0 => number of match offset pairs placed in offsets
2503                   = 0 => offsets overflowed; longest matches are present
2504                    -1 => failed to match
2505                  < -1 => some kind of unexpected problem
2506 */
2507
2508 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2509 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510   const char *subject, int length, int start_offset, int options, int *offsets,
2511   int offsetcount, int *workspace, int wscount)
2512 {
2513 real_pcre *re = (real_pcre *)argument_re;
2514 dfa_match_data match_block;
2515 dfa_match_data *md = &match_block;
2516 BOOL utf8, anchored, startline, firstline;
2517 const uschar *current_subject, *end_subject, *lcc;
2518
2519 pcre_study_data internal_study;
2520 const pcre_study_data *study = NULL;
2521 real_pcre internal_re;
2522
2523 const uschar *req_byte_ptr;
2524 const uschar *start_bits = NULL;
2525 BOOL first_byte_caseless = FALSE;
2526 BOOL req_byte_caseless = FALSE;
2527 int first_byte = -1;
2528 int req_byte = -1;
2529 int req_byte2 = -1;
2530 int newline;
2531
2532 /* Plausibility checks */
2533
2534 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2535 if (re == NULL || subject == NULL || workspace == NULL ||
2536    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2537 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2538 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2539
2540 /* We need to find the pointer to any study data before we test for byte
2541 flipping, so we scan the extra_data block first. This may set two fields in the
2542 match block, so we must initialize them beforehand. However, the other fields
2543 in the match block must not be set until after the byte flipping. */
2544
2545 md->tables = re->tables;
2546 md->callout_data = NULL;
2547
2548 if (extra_data != NULL)
2549   {
2550   unsigned int flags = extra_data->flags;
2551   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2552     study = (const pcre_study_data *)extra_data->study_data;
2553   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2554   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2555     return PCRE_ERROR_DFA_UMLIMIT;
2556   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2557     md->callout_data = extra_data->callout_data;
2558   if ((flags & PCRE_EXTRA_TABLES) != 0)
2559     md->tables = extra_data->tables;
2560   }
2561
2562 /* Check that the first field in the block is the magic number. If it is not,
2563 test for a regex that was compiled on a host of opposite endianness. If this is
2564 the case, flipped values are put in internal_re and internal_study if there was
2565 study data too. */
2566
2567 if (re->magic_number != MAGIC_NUMBER)
2568   {
2569   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2570   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2571   if (study != NULL) study = &internal_study;
2572   }
2573
2574 /* Set some local values */
2575
2576 current_subject = (const unsigned char *)subject + start_offset;
2577 end_subject = (const unsigned char *)subject + length;
2578 req_byte_ptr = current_subject - 1;
2579
2580 #ifdef SUPPORT_UTF8
2581 utf8 = (re->options & PCRE_UTF8) != 0;
2582 #else
2583 utf8 = FALSE;
2584 #endif
2585
2586 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2587   (re->options & PCRE_ANCHORED) != 0;
2588
2589 /* The remaining fixed data for passing around. */
2590
2591 md->start_code = (const uschar *)argument_re +
2592     re->name_table_offset + re->name_count * re->name_entry_size;
2593 md->start_subject = (const unsigned char *)subject;
2594 md->end_subject = end_subject;
2595 md->moptions = options;
2596 md->poptions = re->options;
2597
2598 /* If the BSR option is not set at match time, copy what was set
2599 at compile time. */
2600
2601 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602   {
2603   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605 #ifdef BSR_ANYCRLF
2606   else md->moptions |= PCRE_BSR_ANYCRLF;
2607 #endif
2608   }
2609
2610 /* Handle different types of newline. The three bits give eight cases. If
2611 nothing is set at run time, whatever was used at compile time applies. */
2612
2613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2614          PCRE_NEWLINE_BITS)
2615   {
2616   case 0: newline = NEWLINE; break;   /* Compile-time default */
2617   case PCRE_NEWLINE_CR: newline = '\r'; break;
2618   case PCRE_NEWLINE_LF: newline = '\n'; break;
2619   case PCRE_NEWLINE_CR+
2620        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2621   case PCRE_NEWLINE_ANY: newline = -1; break;
2622   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2623   default: return PCRE_ERROR_BADNEWLINE;
2624   }
2625
2626 if (newline == -2)
2627   {
2628   md->nltype = NLTYPE_ANYCRLF;
2629   }
2630 else if (newline < 0)
2631   {
2632   md->nltype = NLTYPE_ANY;
2633   }
2634 else
2635   {
2636   md->nltype = NLTYPE_FIXED;
2637   if (newline > 255)
2638     {
2639     md->nllen = 2;
2640     md->nl[0] = (newline >> 8) & 255;
2641     md->nl[1] = newline & 255;
2642     }
2643   else
2644     {
2645     md->nllen = 1;
2646     md->nl[0] = newline;
2647     }
2648   }
2649
2650 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2651 back the character offset. */
2652
2653 #ifdef SUPPORT_UTF8
2654 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2655   {
2656   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2657     return PCRE_ERROR_BADUTF8;
2658   if (start_offset > 0 && start_offset < length)
2659     {
2660     int tb = ((uschar *)subject)[start_offset];
2661     if (tb > 127)
2662       {
2663       tb &= 0xc0;
2664       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2665       }
2666     }
2667   }
2668 #endif
2669
2670 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2671 is a feature that makes it possible to save compiled regex and re-use them
2672 in other programs later. */
2673
2674 if (md->tables == NULL) md->tables = _pcre_default_tables;
2675
2676 /* The lower casing table and the "must be at the start of a line" flag are
2677 used in a loop when finding where to start. */
2678
2679 lcc = md->tables + lcc_offset;
2680 startline = (re->flags & PCRE_STARTLINE) != 0;
2681 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682
2683 /* Set up the first character to match, if available. The first_byte value is
2684 never set for an anchored regular expression, but the anchoring may be forced
2685 at run time, so we have to test for anchoring. The first char may be unset for
2686 an unanchored pattern, of course. If there's no first char and the pattern was
2687 studied, there may be a bitmap of possible first characters. */
2688
2689 if (!anchored)
2690   {
2691   if ((re->flags & PCRE_FIRSTSET) != 0)
2692     {
2693     first_byte = re->first_byte & 255;
2694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2695       first_byte = lcc[first_byte];
2696     }
2697   else
2698     {
2699     if (startline && study != NULL &&
2700          (study->options & PCRE_STUDY_MAPPED) != 0)
2701       start_bits = study->start_bits;
2702     }
2703   }
2704
2705 /* For anchored or unanchored matches, there may be a "last known required
2706 character" set. */
2707
2708 if ((re->flags & PCRE_REQCHSET) != 0)
2709   {
2710   req_byte = re->req_byte & 255;
2711   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2712   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2713   }
2714
2715 /* Call the main matching function, looping for a non-anchored regex after a
2716 failed match. Unless restarting, optimize by moving to the first match
2717 character if possible, when not anchored. Then unless wanting a partial match,
2718 check for a required later character. */
2719
2720 for (;;)
2721   {
2722   int rc;
2723
2724   if ((options & PCRE_DFA_RESTART) == 0)
2725     {
2726     const uschar *save_end_subject = end_subject;
2727
2728     /* Advance to a unique first char if possible. If firstline is TRUE, the
2729     start of the match is constrained to the first line of a multiline string.
2730     Implement this by temporarily adjusting end_subject so that we stop
2731     scanning at a newline. If the match fails at the newline, later code breaks
2732     this loop. */
2733
2734     if (firstline)
2735       {
2736       USPTR t = current_subject;
2737 #ifdef SUPPORT_UTF8
2738       if (utf8)
2739         {
2740         while (t < md->end_subject && !IS_NEWLINE(t))
2741           {
2742           t++;
2743           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2744           }
2745         }
2746       else
2747 #endif
2748       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2749       end_subject = t;
2750       }
2751
2752     if (first_byte >= 0)
2753       {
2754       if (first_byte_caseless)
2755         while (current_subject < end_subject &&
2756                lcc[*current_subject] != first_byte)
2757           current_subject++;
2758       else
2759         while (current_subject < end_subject && *current_subject != first_byte)
2760           current_subject++;
2761       }
2762
2763     /* Or to just after a linebreak for a multiline match if possible */
2764
2765     else if (startline)
2766       {
2767       if (current_subject > md->start_subject + start_offset)
2768         {
2769 #ifdef SUPPORT_UTF8
2770         if (utf8)
2771           {
2772           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2773             {
2774             current_subject++;
2775             while(current_subject < end_subject &&
2776                   (*current_subject & 0xc0) == 0x80)
2777               current_subject++;
2778             }
2779           }
2780         else
2781 #endif
2782         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2783           current_subject++;
2784
2785         /* If we have just passed a CR and the newline option is ANY or
2786         ANYCRLF, and we are now at a LF, advance the match position by one more
2787         character. */
2788
2789         if (current_subject[-1] == '\r' &&
2790              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2791              current_subject < end_subject &&
2792              *current_subject == '\n')
2793           current_subject++;
2794         }
2795       }
2796
2797     /* Or to a non-unique first char after study */
2798
2799     else if (start_bits != NULL)
2800       {
2801       while (current_subject < end_subject)
2802         {
2803         register unsigned int c = *current_subject;
2804         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2805           else break;
2806         }
2807       }
2808
2809     /* Restore fudged end_subject */
2810
2811     end_subject = save_end_subject;
2812     }
2813
2814   /* If req_byte is set, we know that that character must appear in the subject
2815   for the match to succeed. If the first character is set, req_byte must be
2816   later in the subject; otherwise the test starts at the match point. This
2817   optimization can save a huge amount of work in patterns with nested unlimited
2818   repeats that aren't going to match. Writing separate code for cased/caseless
2819   versions makes it go faster, as does using an autoincrement and backing off
2820   on a match.
2821
2822   HOWEVER: when the subject string is very, very long, searching to its end can
2823   take a long time, and give bad performance on quite ordinary patterns. This
2824   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2825   don't do this when the string is sufficiently long.
2826
2827   ALSO: this processing is disabled when partial matching is requested.
2828   */
2829
2830   if (req_byte >= 0 &&
2831       end_subject - current_subject < REQ_BYTE_MAX &&
2832       (options & PCRE_PARTIAL) == 0)
2833     {
2834     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2835
2836     /* We don't need to repeat the search if we haven't yet reached the
2837     place we found it at last time. */
2838
2839     if (p > req_byte_ptr)
2840       {
2841       if (req_byte_caseless)
2842         {
2843         while (p < end_subject)
2844           {
2845           register int pp = *p++;
2846           if (pp == req_byte || pp == req_byte2) { p--; break; }
2847           }
2848         }
2849       else
2850         {
2851         while (p < end_subject)
2852           {
2853           if (*p++ == req_byte) { p--; break; }
2854           }
2855         }
2856
2857       /* If we can't find the required character, break the matching loop,
2858       which will cause a return or PCRE_ERROR_NOMATCH. */
2859
2860       if (p >= end_subject) break;
2861
2862       /* If we have found the required character, save the point where we
2863       found it, so that we don't search again next time round the loop if
2864       the start hasn't passed this character yet. */
2865
2866       req_byte_ptr = p;
2867       }
2868     }
2869
2870   /* OK, now we can do the business */
2871
2872   rc = internal_dfa_exec(
2873     md,                                /* fixed match data */
2874     md->start_code,                    /* this subexpression's code */
2875     current_subject,                   /* where we currently are */
2876     start_offset,                      /* start offset in subject */
2877     offsets,                           /* offset vector */
2878     offsetcount,                       /* size of same */
2879     workspace,                         /* workspace vector */
2880     wscount,                           /* size of same */
2881     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2882     0,                                 /* function recurse level */
2883     0);                                /* regex recurse level */
2884
2885   /* Anything other than "no match" means we are done, always; otherwise, carry
2886   on only if not anchored. */
2887
2888   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2889
2890   /* Advance to the next subject character unless we are at the end of a line
2891   and firstline is set. */
2892
2893   if (firstline && IS_NEWLINE(current_subject)) break;
2894   current_subject++;
2895   if (utf8)
2896     {
2897     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2898       current_subject++;
2899     }
2900   if (current_subject > end_subject) break;
2901
2902   /* If we have just passed a CR and we are now at a LF, and the pattern does
2903   not contain any explicit matches for \r or \n, and the newline option is CRLF
2904   or ANY or ANYCRLF, advance the match position by one more character. */
2905
2906   if (current_subject[-1] == '\r' &&
2907       current_subject < end_subject &&
2908       *current_subject == '\n' &&
2909       (re->flags & PCRE_HASCRORLF) == 0 &&
2910         (md->nltype == NLTYPE_ANY ||
2911          md->nltype == NLTYPE_ANYCRLF ||
2912          md->nllen == 2))
2913     current_subject++;
2914
2915   }   /* "Bumpalong" loop */
2916
2917 return PCRE_ERROR_NOMATCH;
2918 }
2919
2920 /* End of pcre_dfa_exec.c */