glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language (but see
   7 below for why this module is different).
   8
   9                        Written by Philip Hazel
  10            Copyright (c) 1997-2010 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41
  42 /* This module contains the external function pcre_dfa_exec(), which is an
  43 alternative matching function that uses a sort of DFA algorithm (not a true
  44 FSM). This is NOT Perl-compatible, but it has advantages in certain
  45 applications. */
  46
  47
  48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  49 the performance of his patterns greatly. I could not use it as it stood, as it
  50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  51 test 7 to loop, and test 9 to crash with a segfault.
  52
  53 The issue is the check for duplicate states, which is done by a simple linear
  54 search up the state list. (Grep for "duplicate" below to find the code.) For
  55 many patterns, there will never be many states active at one time, so a simple
  56 linear search is fine. In patterns that have many active states, it might be a
  57 bottleneck. The suggested code used an indexing scheme to remember which states
  58 had previously been used for each character, and avoided the linear search when
  59 it knew there was no chance of a duplicate. This was implemented when adding
  60 states to the state lists.
  61
  62 I wrote some thread-safe, not-limited code to try something similar at the time
  63 of checking for duplicates (instead of when adding states), using index vectors
  64 on the stack. It did give a 13% improvement with one specially constructed
  65 pattern for certain subject strings, but on other strings and on many of the
  66 simpler patterns in the test suite it did worse. The major problem, I think,
  67 was the extra time to initialize the index. This had to be done for each call
  68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
  69 only once - I suspect this was the cause of the problems with the tests.)
  70
  71 Overall, I concluded that the gains in some cases did not outweigh the losses
  72 in others, so I abandoned this code. */
  73
  74
  75
  76 #ifdef HAVE_CONFIG_H
  77 #include "config.h"
  78 #endif
  79
  80 #define NLBLOCK md             /* Name of the block containing newline information */
  81 #define PSSTART start_subject  /* Name of the field containing processed string start */
  82 #define PSEND   end_subject    /* Name of the field containing processed string end */
  83
  84 #include "pcre_internal.h"
  85
  86
  87 /* String of spaces that the debugging output prints via "%.*s" to indent */
  88
  89 #define SP "                   "
  90
  91
  92 /*************************************************
  93 *      Code parameters and static tables         *
  94 *************************************************/
  95
  96 /* These are offsets that are added to the OP_TYPESTAR and friends opcodes to
  97 turn them into others, under special conditions. A gap of 20 between the blocks
  98 should be enough. The resulting opcodes don't have to be less than 256 because
  99 they are never stored, so we push them well clear of the normal opcodes. */
 100
 101 #define OP_PROP_EXTRA       300   /* Argument is OP_PROP or OP_NOTPROP */
 102 #define OP_EXTUNI_EXTRA     320   /* Argument is OP_EXTUNI */
 103 #define OP_ANYNL_EXTRA      340   /* Argument is OP_ANYNL */
 104 #define OP_HSPACE_EXTRA     360   /* Argument is OP_HSPACE or OP_NOT_HSPACE */
 105 #define OP_VSPACE_EXTRA     380   /* Argument is OP_VSPACE or OP_NOT_VSPACE */
 106
 107
 108 /* This table, indexed by opcode, identifies those opcodes that are followed
 109 immediately by a character that is to be tested in some way. This makes it
 110 possible to centralize the loading of these characters. In the case of Type *
 111 etc, the "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will
 112 always be a small value. Non-zero values in the table are the offsets from the
 113 opcode where the character is to be found. ***NOTE*** If the start of this
 114 table is modified, the three tables that follow must also be modified. */
 115
 116 static const uschar coptable[] = {   /* Length is checked in internal_dfa_exec */
 117   0,                             /* End                                    */
 118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
 119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
 120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
 121   0, 0,                          /* \P, \p                                 */
 122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
 123   0,                             /* \X                                     */
 124   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
 125   1,                             /* Char                                   */
 126   1,                             /* Charnc                                 */
 127   1,                             /* not                                    */
 128   /* Positive single-char repeats                                          */
 129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 130   3, 3, 3,                       /* upto, minupto, exact                   */
 131   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
 132   /* Negative single-char repeats - only for chars < 256                   */
 133   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 134   3, 3, 3,                       /* NOT upto, minupto, exact               */
 135   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
 136   /* Positive type repeats                                                 */
 137   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 138   3, 3, 3,                       /* Type upto, minupto, exact              */
 139   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
 140   /* Character class & ref repeats                                         */
 141   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 142   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 143   0,                             /* CLASS                                  */
 144   0,                             /* NCLASS                                 */
 145   0,                             /* XCLASS - variable length               */
 146   0,                             /* REF                                    */
 147   0,                             /* RECURSE                                */
 148   0,                             /* CALLOUT                                */
 149   0,                             /* Alt                                    */
 150   0,                             /* Ket                                    */
 151   0,                             /* KetRmax                                */
 152   0,                             /* KetRmin                                */
 153   0,                             /* Assert                                 */
 154   0,                             /* Assert not                             */
 155   0,                             /* Assert behind                          */
 156   0,                             /* Assert behind not                      */
 157   0,                             /* Reverse                                */
 158   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 159   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 160   0, 0,                          /* CREF, NCREF                            */
 161   0, 0,                          /* RREF, NRREF                            */
 162   0,                             /* DEF                                    */
 163   0, 0,                          /* BRAZERO, BRAMINZERO                    */
 164   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
 165   0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
 166 };
 167
 168 /* This table, indexed by opcode, flags those opcodes that inspect a character.
 169 It is used to remember that a character could have been inspected when the end
 170 of the subject is reached (for partial matching). ***NOTE*** If the start of
 171 this table is modified, the two tables that follow must also be modified. */
 172
 173 static const uschar poptable[] = {   /* Length is checked in internal_dfa_exec */
 174   0,                             /* End                                    */
 175   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
 176   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
 177   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
 178   1, 1,                          /* \P, \p                                 */
 179   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
 180   1,                             /* \X                                     */
 181   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
 182   1,                             /* Char                                   */
 183   1,                             /* Charnc                                 */
 184   1,                             /* not                                    */
 185   /* Positive single-char repeats                                          */
 186   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 187   1, 1, 1,                       /* upto, minupto, exact                   */
 188   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
 189   /* Negative single-char repeats - only for chars < 256                   */
 190   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 191   1, 1, 1,                       /* NOT upto, minupto, exact               */
 192   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
 193   /* Positive type repeats                                                 */
 194   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 195   1, 1, 1,                       /* Type upto, minupto, exact              */
 196   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
 197   /* Character class & ref repeats                                         */
 198   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 199   1, 1,                          /* CRRANGE, CRMINRANGE                    */
 200   1,                             /* CLASS                                  */
 201   1,                             /* NCLASS                                 */
 202   1,                             /* XCLASS - variable length               */
 203   0,                             /* REF                                    */
 204   0,                             /* RECURSE                                */
 205   0,                             /* CALLOUT                                */
 206   0,                             /* Alt                                    */
 207   0,                             /* Ket                                    */
 208   0,                             /* KetRmax                                */
 209   0,                             /* KetRmin                                */
 210   0,                             /* Assert                                 */
 211   0,                             /* Assert not                             */
 212   0,                             /* Assert behind                          */
 213   0,                             /* Assert behind not                      */
 214   0,                             /* Reverse                                */
 215   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
 216   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
 217   0, 0,                          /* CREF, NCREF                            */
 218   0, 0,                          /* RREF, NRREF                            */
 219   0,                             /* DEF                                    */
 220   0, 0,                          /* BRAZERO, BRAMINZERO                    */
 221   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
 222   0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
 223 };
 224
 225 /* These 2 tables, indexed by opcode like the start of coptable, allow for
 226 compact code for testing for \D, \d, \S, \s, \W, and \w. */
 227
 228 static const uschar toptable1[] = {
 229   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
 230   ctype_digit, ctype_digit,       /* \D, \d */
 231   ctype_space, ctype_space,       /* \S, \s */
 232   ctype_word,  ctype_word,        /* \W, \w */
 233   0, 0                            /* OP_ANY, OP_ALLANY */
 234 };
 235
 236 static const uschar toptable2[] = {   /* Same indexing as toptable1 */
 237   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
 238   ctype_digit, 0,                 /* \D, \d */
 239   ctype_space, 0,                 /* \S, \s */
 240   ctype_word,  0,                 /* \W, \w */
 241   1, 1                            /* OP_ANY, OP_ALLANY */
 242 };
 243
 244
 245 /* Structure for holding data about a particular state, which is in effect the
 246 current data for an active path through the match tree. It must consist
 247 entirely of ints because the working vector we are passed, and in which we put
 248 these structures, is a vector of ints. */
 249
 250 typedef struct stateblock {
 251   int offset;                     /* Offset to opcode from start_code; negative means held off */
 252   int count;                      /* Count for repeats */
 253   int ims;                        /* ims flag bits */
 254   int data;                       /* Extra data that only some states use */
 255 } stateblock;
 256 /* Number of ints that one stateblock occupies in the workspace vector */
 257 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
 258
 259
 260 #ifdef PCRE_DEBUG
 261 /*************************************************
 262 *             Print character string             *
 263 *************************************************/
 264
 265 /* Debugging aid: write a byte string, hex-escaping the unprintable bytes.
 266
 267 Arguments:
 268   p            start of the bytes to print
 269   length       how many bytes to print
 270   f            stream that receives the output
 271
 272 Returns:       nothing
 273 */
 274
 275 static void
 276 pchars(unsigned char *p, int length, FILE *f)
 277 {
 278 int i;
 279 for (i = 0; i < length; i++)
 280   {
 281   const int ch = p[i];
 282   if (!isprint(ch))
 283     fprintf(f, "\\x%02x", ch);     /* Unprintable: show as \xhh */
 284   else fprintf(f, "%c", ch);       /* Printable: show as is */
 285   }
 286 }
 287 #endif
 288
 289
 290
 291 /*************************************************
 292 *    Execute a Regular Expression - DFA engine   *
 293 *************************************************/
 294
 295 /* This internal function applies a compiled pattern to a subject string,
 296 starting at a given point, using a DFA engine. This function is called from the
 297 external one, possibly multiple times if the pattern is not anchored. The
 298 function calls itself recursively for some kinds of subpattern.
 299
 300 Arguments:
 301   md                the match_data block with fixed information
 302   this_start_code   the opening bracket of this subexpression's code
 303   current_subject   where we currently are in the subject string
 304   start_offset      start offset in the subject string
 305   offsets           vector to contain the matching string offsets
 306   offsetcount       size of same
 307   workspace         vector of workspace
 308   wscount           size of same
 309   ims               the current ims flags
 310   rlevel            function call recursion level
 311   recursing         regex recursive call level
 312
 313 Returns:            > 0 => number of match offset pairs placed in offsets
 314                     = 0 => offsets overflowed; longest matches are present
 315                      -1 => failed to match
 316                    < -1 => some kind of unexpected problem
 317
 318 The following macros are used for adding states to the two state vectors (one
 319 for the current character, one for the following character). */
 320 /* Add state (offset x, count y, current ims) to the active list; data is not set. */
 321 #define ADD_ACTIVE(x,y) \
 322   if (active_count++ < wscount) \
 323     { \
 324     next_active_state->offset = (x); \
 325     next_active_state->count  = (y); \
 326     next_active_state->ims    = ims; \
 327     next_active_state++; \
 328     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 329     } \
 330   else return PCRE_ERROR_DFA_WSSIZE  /* Count reached wscount: enclosing function returns */
 331 /* As ADD_ACTIVE, but also stores z in the data field of the added state. */
 332 #define ADD_ACTIVE_DATA(x,y,z) \
 333   if (active_count++ < wscount) \
 334     { \
 335     next_active_state->offset = (x); \
 336     next_active_state->count  = (y); \
 337     next_active_state->ims    = ims; \
 338     next_active_state->data   = (z); \
 339     next_active_state++; \
 340     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 341     } \
 342   else return PCRE_ERROR_DFA_WSSIZE  /* Count reached wscount: enclosing function returns */
 343 /* Add state (offset x, count y, current ims) to the new list; data is not set. */
 344 #define ADD_NEW(x,y) \
 345   if (new_count++ < wscount) \
 346     { \
 347     next_new_state->offset = (x); \
 348     next_new_state->count  = (y); \
 349     next_new_state->ims    = ims; \
 350     next_new_state++; \
 351     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 352     } \
 353   else return PCRE_ERROR_DFA_WSSIZE  /* Count reached wscount: enclosing function returns */
 354 /* As ADD_NEW, but also stores z in the data field of the added state. */
 355 #define ADD_NEW_DATA(x,y,z) \
 356   if (new_count++ < wscount) \
 357     { \
 358     next_new_state->offset = (x); \
 359     next_new_state->count  = (y); \
 360     next_new_state->ims    = ims; \
 361     next_new_state->data   = (z); \
 362     next_new_state++; \
 363     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 364     } \
 365   else return PCRE_ERROR_DFA_WSSIZE  /* Count reached wscount: enclosing function returns */
 366
 367 /* And now, here is the code */
 368
 369 static int
 370 internal_dfa_exec(
 371   dfa_match_data *md,
 372   const uschar *this_start_code,
 373   const uschar *current_subject,
 374   int start_offset,
 375   int *offsets,
 376   int offsetcount,
 377   int *workspace,
 378   int wscount,
 379   int ims,
 380   int  rlevel,
 381   int  recursing)
 382 {
 383 stateblock *active_states, *new_states, *temp_states;
 384 stateblock *next_active_state, *next_new_state;
 385
 386 const uschar *ctypes, *lcc, *fcc;
 387 const uschar *ptr;
 388 const uschar *end_code, *first_op;
 389
 390 int active_count, new_count, match_count;
 391
 392 /* Some fields in the md block are frequently referenced, so we load them into
 393 independent variables in the hope that this will perform better. */
 394
 395 const uschar *start_subject = md->start_subject;
 396 const uschar *end_subject = md->end_subject;
 397 const uschar *start_code = md->start_code;
 398
 399 #ifdef SUPPORT_UTF8
 400 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
 401 #else
 402 BOOL utf8 = FALSE;
 403 #endif
 404
 405 rlevel++;
 406 offsetcount &= (-2);
 407
 408 wscount -= 2;
 409 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 410           (2 * INTS_PER_STATEBLOCK);
 411
 412 DPRINTF(("\n%.*s---------------------\n"
 413   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
 414   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
 415
 416 ctypes = md->tables + ctypes_offset;
 417 lcc = md->tables + lcc_offset;
 418 fcc = md->tables + fcc_offset;
 419
 420 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 421
 422 active_states = (stateblock *)(workspace + 2);
 423 next_new_state = new_states = active_states + wscount;
 424 new_count = 0;
 425
 426 first_op = this_start_code + 1 + LINK_SIZE +
 427   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 428
 429 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 430 the alternative states onto the list, and find out where the end is. This
 431 makes it possible to use this function recursively, when we want to stop at a
 432 matching internal ket rather than at the end.
 433
 434 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 435 a backward assertion. In that case, we have to find out the maximum amount to
 436 move back, and set up each alternative appropriately. */
 437
 438 if (*first_op == OP_REVERSE)
 439   {
 440   int max_back = 0;
 441   int gone_back;
 442
 443   end_code = this_start_code;
 444   do
 445     {
 446     int back = GET(end_code, 2+LINK_SIZE);
 447     if (back > max_back) max_back = back;
 448     end_code += GET(end_code, 1);
 449     }
 450   while (*end_code == OP_ALT);
 451
 452   /* If we can't go back the amount required for the longest lookbehind
 453   pattern, go back as far as we can; some alternatives may still be viable. */
 454
 455 #ifdef SUPPORT_UTF8
 456   /* In character mode we have to step back character by character */
 457
 458   if (utf8)
 459     {
 460     for (gone_back = 0; gone_back < max_back; gone_back++)
 461       {
 462       if (current_subject <= start_subject) break;
 463       current_subject--;
 464       while (current_subject > start_subject &&
 465              (*current_subject & 0xc0) == 0x80)
 466         current_subject--;
 467       }
 468     }
 469   else
 470 #endif
 471
 472   /* In byte-mode we can do this quickly. */
 473
 474     {
 475     gone_back = (current_subject - max_back < start_subject)?
 476       current_subject - start_subject : max_back;
 477     current_subject -= gone_back;
 478     }
 479
 480   /* Save the earliest consulted character */
 481
 482   if (current_subject < md->start_used_ptr)
 483     md->start_used_ptr = current_subject;
 484
 485   /* Now we can process the individual branches. */
 486
 487   end_code = this_start_code;
 488   do
 489     {
 490     int back = GET(end_code, 2+LINK_SIZE);
 491     if (back <= gone_back)
 492       {
 493       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
 494       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 495       }
 496     end_code += GET(end_code, 1);
 497     }
 498   while (*end_code == OP_ALT);
 499  }
 500
 501 /* This is the code for a "normal" subpattern (not a backward assertion). The
 502 start of a whole pattern is always one of these. If we are at the top level,
 503 we may be asked to restart matching from the same point that we reached for a
 504 previous partial match. We still have to scan through the top-level branches to
 505 find the end state. */
 506
 507 else
 508   {
 509   end_code = this_start_code;
 510
 511   /* Restarting */
 512
 513   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 514     {
 515     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 516     new_count = workspace[1];
 517     if (!workspace[0])
 518       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 519     }
 520
 521   /* Not restarting */
 522
 523   else
 524     {
 525     int length = 1 + LINK_SIZE +
 526       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
 527     do
 528       {
 529       ADD_NEW(end_code - start_code + length, 0);
 530       end_code += GET(end_code, 1);
 531       length = 1 + LINK_SIZE;
 532       }
 533     while (*end_code == OP_ALT);
 534     }
 535   }
 536
 537 workspace[0] = 0;    /* Bit indicating which vector is current */
 538
 539 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
 540
 541 /* Loop for scanning the subject */
 542
 543 ptr = current_subject;
 544 for (;;)
 545   {
 546   int i, j;
 547   int clen, dlen;
 548   unsigned int c, d;
 549   int forced_fail = 0;
 550   BOOL could_continue = FALSE;
 551
 552   /* Make the new state list into the active state list and empty the
 553   new state list. */
 554
 555   temp_states = active_states;
 556   active_states = new_states;
 557   new_states = temp_states;
 558   active_count = new_count;
 559   new_count = 0;
 560
 561   workspace[0] ^= 1;              /* Remember for the restarting feature */
 562   workspace[1] = active_count;
 563
 564 #ifdef PCRE_DEBUG
 565   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 566   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
 567   printf("\"\n");
 568
 569   printf("%.*sActive states: ", rlevel*2-2, SP);
 570   for (i = 0; i < active_count; i++)
 571     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 572   printf("\n");
 573 #endif
 574
 575   /* Set the pointers for adding new states */
 576
 577   next_active_state = active_states + active_count;
 578   next_new_state = new_states;
 579
 580   /* Load the current character from the subject outside the loop, as many
 581   different states may want to look at it, and we assume that at least one
 582   will. */
 583
 584   if (ptr < end_subject)
 585     {
 586     clen = 1;        /* Number of bytes in the character */
 587 #ifdef SUPPORT_UTF8
 588     if (utf8) { GETCHARLEN(c, ptr, clen); } else
 589 #endif  /* SUPPORT_UTF8 */
 590     c = *ptr;
 591     }
 592   else
 593     {
 594     clen = 0;        /* This indicates the end of the subject */
 595     c = NOTACHAR;    /* This value should never actually be used */
 596     }
 597
 598   /* Scan up the active states and act on each one. The result of an action
 599   may be to add more states to the currently active list (e.g. on hitting a
 600   parenthesis) or it may be to put states on the new list, for considering
 601   when we move the character pointer on. */
 602
 603   for (i = 0; i < active_count; i++)
 604     {
 605     stateblock *current_state = active_states + i;
 606     const uschar *code;
 607     int state_offset = current_state->offset;
 608     int count, codevalue, rrc;
 609
 610 #ifdef PCRE_DEBUG
 611     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 612     if (clen == 0) printf("EOL\n");
 613       else if (c > 32 && c < 127) printf("'%c'\n", c);
 614         else printf("0x%02x\n", c);
 615 #endif
 616
 617     /* This variable is referred to implicitly in the ADD_xxx macros. */
 618
 619     ims = current_state->ims;
 620
 621     /* A negative offset is a special case meaning "hold off going to this
 622     (negated) state until the number of characters in the data field have
 623     been skipped". */
 624
 625     if (state_offset < 0)
 626       {
 627       if (current_state->data > 0)
 628         {
 629         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 630         ADD_NEW_DATA(state_offset, current_state->count,
 631           current_state->data - 1);
 632         continue;
 633         }
 634       else
 635         {
 636         current_state->offset = state_offset = -state_offset;
 637         }
 638       }
 639
 640     /* Check for a duplicate state with the same count, and skip if found.
 641     See the note at the head of this module about the possibility of improving
 642     performance here. */
 643
 644     for (j = 0; j < i; j++)
 645       {
 646       if (active_states[j].offset == state_offset &&
 647           active_states[j].count == current_state->count)
 648         {
 649         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 650         goto NEXT_ACTIVE_STATE;
 651         }
 652       }
 653
 654     /* The state offset is the offset to the opcode */
 655
 656     code = start_code + state_offset;
 657     codevalue = *code;
 658
 659     /* If this opcode inspects a character, but we are at the end of the
 660     subject, remember the fact for use when testing for a partial match. */
 661
 662     if (clen == 0 && poptable[codevalue] != 0)
 663       could_continue = TRUE;
 664
 665     /* If this opcode is followed by an inline character, load it. It is
 666     tempting to test for the presence of a subject character here, but that
 667     is wrong, because sometimes zero repetitions of the subject are
 668     permitted.
 669
 670     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 671     argument that is not a data character - but is always one byte long. We
 672     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
 673     this case. To keep the other cases fast, convert these ones to new opcodes.
 674     */
 675
 676     if (coptable[codevalue] > 0)
 677       {
 678       dlen = 1;
 679 #ifdef SUPPORT_UTF8
 680       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 681 #endif  /* SUPPORT_UTF8 */
 682       d = code[coptable[codevalue]];
 683       if (codevalue >= OP_TYPESTAR)
 684         {
 685         switch(d)
 686           {
 687           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 688           case OP_NOTPROP:
 689           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 690           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 691           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 692           case OP_NOT_HSPACE:
 693           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 694           case OP_NOT_VSPACE:
 695           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 696           default: break;
 697           }
 698         }
 699       }
 700     else
 701       {
 702       dlen = 0;         /* Not strictly necessary, but compilers moan */
 703       d = NOTACHAR;     /* if these variables are not set. */
 704       }
 705
 706
 707     /* Now process the individual opcodes */
 708
 709     switch (codevalue)
 710       {
 711 /* ========================================================================== */
 712       /* These cases are never obeyed. This is a fudge that causes a compile-
 713       time error if the vectors coptable or poptable, which are indexed by
 714       opcode, are not the correct length. It seems to be the only way to do
 715       such a check at compile time, as the sizeof() operator does not work
 716       in the C preprocessor. */
 717
 718       case OP_TABLE_LENGTH:
 719       case OP_TABLE_LENGTH +
 720         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 721          (sizeof(poptable) == OP_TABLE_LENGTH)):
 722       break;
 723
 724 /* ========================================================================== */
 725       /* Reached a closing bracket. If not at the end of the pattern, carry
 726       on with the next opcode. Otherwise, unless we have an empty string and
 727       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
 728       start of the subject, save the match data, shifting up all previous
 729       matches so we always have the longest first. */
 730
 731       case OP_KET:
 732       case OP_KETRMIN:
 733       case OP_KETRMAX:
 734       if (code != end_code)
 735         {
 736         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 737         if (codevalue != OP_KET)
 738           {
 739           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 740           }
 741         }
 742       else
 743         {
 744         if (ptr > current_subject ||
 745             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
 746               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
 747                 current_subject > start_subject + md->start_offset)))
 748           {
 749           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 750             else if (match_count > 0 && ++match_count * 2 >= offsetcount)
 751               match_count = 0;
 752           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 753           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 754           if (offsetcount >= 2)
 755             {
 756             offsets[0] = current_subject - start_subject;
 757             offsets[1] = ptr - start_subject;
 758             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 759               offsets[1] - offsets[0], current_subject));
 760             }
 761           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 762             {
 763             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 764               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 765               match_count, rlevel*2-2, SP));
 766             return match_count;
 767             }
 768           }
 769         }
 770       break;
 771
 772 /* ========================================================================== */
 773       /* These opcodes add to the current list of states without looking
 774       at the current character. */
 775
 776       /*-----------------------------------------------------------------*/
 777       case OP_ALT:
 778       do { code += GET(code, 1); } while (*code == OP_ALT);
 779       ADD_ACTIVE(code - start_code, 0);
 780       break;
 781
 782       /*-----------------------------------------------------------------*/
 783       case OP_BRA:
 784       case OP_SBRA:
 785       do
 786         {
 787         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 788         code += GET(code, 1);
 789         }
 790       while (*code == OP_ALT);
 791       break;
 792
 793       /*-----------------------------------------------------------------*/
 794       case OP_CBRA:
 795       case OP_SCBRA:
 796       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
 797       code += GET(code, 1);
 798       while (*code == OP_ALT)
 799         {
 800         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
 801         code += GET(code, 1);
 802         }
 803       break;
 804
 805       /*-----------------------------------------------------------------*/
 806       case OP_BRAZERO:
 807       case OP_BRAMINZERO:
 808       ADD_ACTIVE(state_offset + 1, 0);
 809       code += 1 + GET(code, 2);
 810       while (*code == OP_ALT) code += GET(code, 1);
 811       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 812       break;
 813
 814       /*-----------------------------------------------------------------*/
 815       case OP_SKIPZERO:
 816       code += 1 + GET(code, 2);
 817       while (*code == OP_ALT) code += GET(code, 1);
 818       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
 819       break;
 820
 821       /*-----------------------------------------------------------------*/
 822       case OP_CIRC:
 823       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 824           ((ims & PCRE_MULTILINE) != 0 &&
 825             ptr != end_subject &&
 826             WAS_NEWLINE(ptr)))
 827         { ADD_ACTIVE(state_offset + 1, 0); }
 828       break;
 829
 830       /*-----------------------------------------------------------------*/
 831       case OP_EOD:
 832       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 833       break;
 834
 835       /*-----------------------------------------------------------------*/
 836       case OP_OPT:
 837       ims = code[1];
 838       ADD_ACTIVE(state_offset + 2, 0);
 839       break;
 840
 841       /*-----------------------------------------------------------------*/
 842       case OP_SOD:
 843       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 844       break;
 845
 846       /*-----------------------------------------------------------------*/
 847       case OP_SOM:
 848       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 849       break;
 850
 851
 852 /* ========================================================================== */
 853       /* These opcodes inspect the next subject character, and sometimes
 854       the previous one as well, but do not have an argument. The variable
 855       clen contains the length of the current character and is zero if we are
 856       at the end of the subject. */
 857
 858       /*-----------------------------------------------------------------*/
 859       case OP_ANY:
 860       if (clen > 0 && !IS_NEWLINE(ptr))
 861         { ADD_NEW(state_offset + 1, 0); }
 862       break;
 863
 864       /*-----------------------------------------------------------------*/
 865       case OP_ALLANY:
 866       if (clen > 0)
 867         { ADD_NEW(state_offset + 1, 0); }
 868       break;
 869
 870       /*-----------------------------------------------------------------*/
 871       case OP_EODN:
 872       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 873         { ADD_ACTIVE(state_offset + 1, 0); }
 874       break;
 875
 876       /*-----------------------------------------------------------------*/
 877       case OP_DOLL:
 878       if ((md->moptions & PCRE_NOTEOL) == 0)
 879         {
 880         if (clen == 0 ||
 881             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
 882                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
 883             ))
 884           { ADD_ACTIVE(state_offset + 1, 0); }
 885         }
 886       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
 887         { ADD_ACTIVE(state_offset + 1, 0); }
 888       break;
 889
 890       /*-----------------------------------------------------------------*/
 891
 892       case OP_DIGIT:
 893       case OP_WHITESPACE:
 894       case OP_WORDCHAR:
 895       if (clen > 0 && c < 256 &&
 896             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 897         { ADD_NEW(state_offset + 1, 0); }
 898       break;
 899
 900       /*-----------------------------------------------------------------*/
 901       case OP_NOT_DIGIT:
 902       case OP_NOT_WHITESPACE:
 903       case OP_NOT_WORDCHAR:
 904       if (clen > 0 && (c >= 256 ||
 905             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 906         { ADD_NEW(state_offset + 1, 0); }
 907       break;
 908
 909       /*-----------------------------------------------------------------*/
 910       case OP_WORD_BOUNDARY:
 911       case OP_NOT_WORD_BOUNDARY:
 912         {
 913         int left_word, right_word;
 914
 915         if (ptr > start_subject)
 916           {
 917           const uschar *temp = ptr - 1;
 918           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
 919 #ifdef SUPPORT_UTF8
 920           if (utf8) BACKCHAR(temp);
 921 #endif
 922           GETCHARTEST(d, temp);
 923           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 924           }
 925         else left_word = 0;
 926
 927         if (clen > 0)
 928           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 929         else right_word = 0;
 930
 931         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
 932           { ADD_ACTIVE(state_offset + 1, 0); }
 933         }
 934       break;
 935
 936
 937       /*-----------------------------------------------------------------*/
 938       /* Check the next character by Unicode property. We will get here only
 939       if the support is in the binary; otherwise a compile-time error occurs.
 940       */
 941
 942 #ifdef SUPPORT_UCP
 943       case OP_PROP:
 944       case OP_NOTPROP:
 945       if (clen > 0)
 946         {
 947         BOOL OK;
 948         int chartype = UCD_CHARTYPE(c);
 949         switch(code[1])
 950           {
 951           case PT_ANY:
 952           OK = TRUE;
 953           break;
 954
 955           case PT_LAMP:
 956           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 957           break;
 958
 959           case PT_GC:
 960           OK = _pcre_ucp_gentype[chartype] == code[2];
 961           break;
 962
 963           case PT_PC:
 964           OK = chartype == code[2];
 965           break;
 966
 967           case PT_SC:
 968           OK = UCD_SCRIPT(c) == code[2];
 969           break;
 970
 971           /* Should never occur, but keep compilers from grumbling. */
 972
 973           default:
 974           OK = codevalue != OP_PROP;
 975           break;
 976           }
 977
 978         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
 979         }
 980       break;
 981 #endif
 982
 983
 984
 985 /* ========================================================================== */
 986       /* These opcodes likewise inspect the subject character, but have an
 987       argument that is not a data character. It is one of these opcodes:
 988       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
 989       OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 990
 991       case OP_TYPEPLUS:
 992       case OP_TYPEMINPLUS:
 993       case OP_TYPEPOSPLUS:
 994       count = current_state->count;  /* Already matched */
 995       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 996       if (clen > 0)
 997         {
 998         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
 999             (c < 256 &&
1000               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1001               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1002           {
1003           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1004             {
1005             active_count--;            /* Remove non-match possibility */
1006             next_active_state--;
1007             }
1008           count++;
1009           ADD_NEW(state_offset, count);
1010           }
1011         }
1012       break;
1013
1014       /*-----------------------------------------------------------------*/
1015       case OP_TYPEQUERY:
1016       case OP_TYPEMINQUERY:
1017       case OP_TYPEPOSQUERY:
1018       ADD_ACTIVE(state_offset + 2, 0);
1019       if (clen > 0)
1020         {
1021         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1022             (c < 256 &&
1023               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1024               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1025           {
1026           if (codevalue == OP_TYPEPOSQUERY)
1027             {
1028             active_count--;            /* Remove non-match possibility */
1029             next_active_state--;
1030             }
1031           ADD_NEW(state_offset + 2, 0);
1032           }
1033         }
1034       break;
1035
1036       /*-----------------------------------------------------------------*/
1037       case OP_TYPESTAR:
1038       case OP_TYPEMINSTAR:
1039       case OP_TYPEPOSSTAR:
1040       ADD_ACTIVE(state_offset + 2, 0);
1041       if (clen > 0)
1042         {
1043         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1044             (c < 256 &&
1045               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1046               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1047           {
1048           if (codevalue == OP_TYPEPOSSTAR)
1049             {
1050             active_count--;            /* Remove non-match possibility */
1051             next_active_state--;
1052             }
1053           ADD_NEW(state_offset, 0);
1054           }
1055         }
1056       break;
1057
1058       /*-----------------------------------------------------------------*/
1059       case OP_TYPEEXACT:
1060       count = current_state->count;  /* Number already matched */
1061       if (clen > 0)
1062         {
1063         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1064             (c < 256 &&
1065               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1066               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1067           {
1068           if (++count >= GET2(code, 1))
1069             { ADD_NEW(state_offset + 4, 0); }
1070           else
1071             { ADD_NEW(state_offset, count); }
1072           }
1073         }
1074       break;
1075
1076       /*-----------------------------------------------------------------*/
1077       case OP_TYPEUPTO:
1078       case OP_TYPEMINUPTO:
1079       case OP_TYPEPOSUPTO:
1080       ADD_ACTIVE(state_offset + 4, 0);
1081       count = current_state->count;  /* Number already matched */
1082       if (clen > 0)
1083         {
1084         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1085             (c < 256 &&
1086               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1087               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1088           {
1089           if (codevalue == OP_TYPEPOSUPTO)
1090             {
1091             active_count--;           /* Remove non-match possibility */
1092             next_active_state--;
1093             }
1094           if (++count >= GET2(code, 1))
1095             { ADD_NEW(state_offset + 4, 0); }
1096           else
1097             { ADD_NEW(state_offset, count); }
1098           }
1099         }
1100       break;
1101
1102 /* ========================================================================== */
1103       /* These are virtual opcodes that are used when something like
1104       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1105       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. This keeps the
1106       code above fast for the other cases. The argument is in the d variable. */
1107
1108 #ifdef SUPPORT_UCP
1109       case OP_PROP_EXTRA + OP_TYPEPLUS:
1110       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1111       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1112       count = current_state->count;           /* Already matched */
1113       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1114       if (clen > 0)
1115         {
1116         BOOL OK;
1117         int chartype = UCD_CHARTYPE(c);
1118         switch(code[2])
1119           {
1120           case PT_ANY:
1121           OK = TRUE;
1122           break;
1123
1124           case PT_LAMP:
1125           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1126           break;
1127
1128           case PT_GC:
1129           OK = _pcre_ucp_gentype[chartype] == code[3];
1130           break;
1131
1132           case PT_PC:
1133           OK = chartype == code[3];
1134           break;
1135
1136           case PT_SC:
1137           OK = UCD_SCRIPT(c) == code[3];
1138           break;
1139
1140           /* Should never occur, but keep compilers from grumbling. */
1141
1142           default:
1143           OK = codevalue != OP_PROP;
1144           break;
1145           }
1146
1147         if (OK == (d == OP_PROP))
1148           {
1149           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1150             {
1151             active_count--;           /* Remove non-match possibility */
1152             next_active_state--;
1153             }
1154           count++;
1155           ADD_NEW(state_offset, count);
1156           }
1157         }
1158       break;
1159
1160       /*-----------------------------------------------------------------*/
1161       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1162       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1163       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1164       count = current_state->count;  /* Already matched */
1165       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1166       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1167         {
1168         const uschar *nptr = ptr + clen;
1169         int ncount = 0;
1170         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1171           {
1172           active_count--;           /* Remove non-match possibility */
1173           next_active_state--;
1174           }
1175         while (nptr < end_subject)
1176           {
1177           int nd;
1178           int ndlen = 1;
1179           GETCHARLEN(nd, nptr, ndlen);
1180           if (UCD_CATEGORY(nd) != ucp_M) break;
1181           ncount++;
1182           nptr += ndlen;
1183           }
1184         count++;
1185         ADD_NEW_DATA(-state_offset, count, ncount);
1186         }
1187       break;
1188 #endif
1189
1190       /*-----------------------------------------------------------------*/
1191       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1192       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1193       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1194       count = current_state->count;  /* Already matched */
1195       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1196       if (clen > 0)
1197         {
1198         int ncount = 0;
1199         switch (c)
1200           {
1201           case 0x000b:
1202           case 0x000c:
1203           case 0x0085:
1204           case 0x2028:
1205           case 0x2029:
1206           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1207           goto ANYNL01;
1208
1209           case 0x000d:
1210           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1211           /* Fall through */
1212
1213           ANYNL01:
1214           case 0x000a:
1215           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1216             {
1217             active_count--;           /* Remove non-match possibility */
1218             next_active_state--;
1219             }
1220           count++;
1221           ADD_NEW_DATA(-state_offset, count, ncount);
1222           break;
1223
1224           default:
1225           break;
1226           }
1227         }
1228       break;
1229
1230       /*-----------------------------------------------------------------*/
1231       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1232       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1233       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1234       count = current_state->count;  /* Already matched */
1235       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1236       if (clen > 0)
1237         {
1238         BOOL OK;
1239         switch (c)
1240           {
1241           case 0x000a:
1242           case 0x000b:
1243           case 0x000c:
1244           case 0x000d:
1245           case 0x0085:
1246           case 0x2028:
1247           case 0x2029:
1248           OK = TRUE;
1249           break;
1250
1251           default:
1252           OK = FALSE;
1253           break;
1254           }
1255
1256         if (OK == (d == OP_VSPACE))
1257           {
1258           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1259             {
1260             active_count--;           /* Remove non-match possibility */
1261             next_active_state--;
1262             }
1263           count++;
1264           ADD_NEW_DATA(-state_offset, count, 0);
1265           }
1266         }
1267       break;
1268
1269       /*-----------------------------------------------------------------*/
1270       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1271       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1272       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1273       count = current_state->count;  /* Already matched */
1274       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1275       if (clen > 0)
1276         {
1277         BOOL OK;
1278         switch (c)
1279           {
1280           case 0x09:      /* HT */
1281           case 0x20:      /* SPACE */
1282           case 0xa0:      /* NBSP */
1283           case 0x1680:    /* OGHAM SPACE MARK */
1284           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1285           case 0x2000:    /* EN QUAD */
1286           case 0x2001:    /* EM QUAD */
1287           case 0x2002:    /* EN SPACE */
1288           case 0x2003:    /* EM SPACE */
1289           case 0x2004:    /* THREE-PER-EM SPACE */
1290           case 0x2005:    /* FOUR-PER-EM SPACE */
1291           case 0x2006:    /* SIX-PER-EM SPACE */
1292           case 0x2007:    /* FIGURE SPACE */
1293           case 0x2008:    /* PUNCTUATION SPACE */
1294           case 0x2009:    /* THIN SPACE */
1295           case 0x200A:    /* HAIR SPACE */
1296           case 0x202f:    /* NARROW NO-BREAK SPACE */
1297           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1298           case 0x3000:    /* IDEOGRAPHIC SPACE */
1299           OK = TRUE;
1300           break;
1301
1302           default:
1303           OK = FALSE;
1304           break;
1305           }
1306
1307         if (OK == (d == OP_HSPACE))
1308           {
1309           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1310             {
1311             active_count--;           /* Remove non-match possibility */
1312             next_active_state--;
1313             }
1314           count++;
1315           ADD_NEW_DATA(-state_offset, count, 0);
1316           }
1317         }
1318       break;
1319
1320       /*-----------------------------------------------------------------*/
1321 #ifdef SUPPORT_UCP
1322       case OP_PROP_EXTRA + OP_TYPEQUERY:
1323       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1324       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1325       count = 4;
1326       goto QS1;
1327
1328       case OP_PROP_EXTRA + OP_TYPESTAR:
1329       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1330       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1331       count = 0;
1332
1333       QS1:
1334
1335       ADD_ACTIVE(state_offset + 4, 0);
1336       if (clen > 0)
1337         {
1338         BOOL OK;
1339         int chartype = UCD_CHARTYPE(c);
1340         switch(code[2])
1341           {
1342           case PT_ANY:
1343           OK = TRUE;
1344           break;
1345
1346           case PT_LAMP:
1347           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1348           break;
1349
1350           case PT_GC:
1351           OK = _pcre_ucp_gentype[chartype] == code[3];
1352           break;
1353
1354           case PT_PC:
1355           OK = chartype == code[3];
1356           break;
1357
1358           case PT_SC:
1359           OK = UCD_SCRIPT(c) == code[3];
1360           break;
1361
1362           /* Should never occur, but keep compilers from grumbling. */
1363
1364           default:
1365           OK = codevalue != OP_PROP;
1366           break;
1367           }
1368
1369         if (OK == (d == OP_PROP))
1370           {
1371           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1372               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1373             {
1374             active_count--;           /* Remove non-match possibility */
1375             next_active_state--;
1376             }
1377           ADD_NEW(state_offset + count, 0);
1378           }
1379         }
1380       break;
1381
1382       /*-----------------------------------------------------------------*/
1383       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1384       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1385       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1386       count = 2;
1387       goto QS2;
1388
1389       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1390       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1391       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1392       count = 0;
1393
1394       QS2:
1395
1396       ADD_ACTIVE(state_offset + 2, 0);
1397       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1398         {
1399         const uschar *nptr = ptr + clen;
1400         int ncount = 0;
1401         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1402             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1403           {
1404           active_count--;           /* Remove non-match possibility */
1405           next_active_state--;
1406           }
1407         while (nptr < end_subject)
1408           {
1409           int nd;
1410           int ndlen = 1;
1411           GETCHARLEN(nd, nptr, ndlen);
1412           if (UCD_CATEGORY(nd) != ucp_M) break;
1413           ncount++;
1414           nptr += ndlen;
1415           }
1416         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1417         }
1418       break;
1419 #endif
1420
1421       /*-----------------------------------------------------------------*/
1422       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1423       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1424       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1425       count = 2;
1426       goto QS3;
1427
1428       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1429       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1430       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1431       count = 0;
1432
1433       QS3:
1434       ADD_ACTIVE(state_offset + 2, 0);
1435       if (clen > 0)
1436         {
1437         int ncount = 0;
1438         switch (c)
1439           {
1440           case 0x000b:
1441           case 0x000c:
1442           case 0x0085:
1443           case 0x2028:
1444           case 0x2029:
1445           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1446           goto ANYNL02;
1447
1448           case 0x000d:
1449           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1450           /* Fall through */
1451
1452           ANYNL02:
1453           case 0x000a:
1454           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1455               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1456             {
1457             active_count--;           /* Remove non-match possibility */
1458             next_active_state--;
1459             }
1460           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1461           break;
1462
1463           default:
1464           break;
1465           }
1466         }
1467       break;
1468
1469       /*-----------------------------------------------------------------*/
1470       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1471       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1472       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1473       count = 2;
1474       goto QS4;
1475
1476       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1477       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1478       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1479       count = 0;
1480
1481       QS4:
1482       ADD_ACTIVE(state_offset + 2, 0);
1483       if (clen > 0)
1484         {
1485         BOOL OK;
1486         switch (c)
1487           {
1488           case 0x000a:
1489           case 0x000b:
1490           case 0x000c:
1491           case 0x000d:
1492           case 0x0085:
1493           case 0x2028:
1494           case 0x2029:
1495           OK = TRUE;
1496           break;
1497
1498           default:
1499           OK = FALSE;
1500           break;
1501           }
1502         if (OK == (d == OP_VSPACE))
1503           {
1504           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1505               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1506             {
1507             active_count--;           /* Remove non-match possibility */
1508             next_active_state--;
1509             }
1510           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1511           }
1512         }
1513       break;
1514
1515       /*-----------------------------------------------------------------*/
1516       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1517       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1518       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1519       count = 2;
1520       goto QS5;
1521
1522       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1523       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1524       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1525       count = 0;
1526
1527       QS5:
1528       ADD_ACTIVE(state_offset + 2, 0);
1529       if (clen > 0)
1530         {
1531         BOOL OK;
1532         switch (c)
1533           {
1534           case 0x09:      /* HT */
1535           case 0x20:      /* SPACE */
1536           case 0xa0:      /* NBSP */
1537           case 0x1680:    /* OGHAM SPACE MARK */
1538           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1539           case 0x2000:    /* EN QUAD */
1540           case 0x2001:    /* EM QUAD */
1541           case 0x2002:    /* EN SPACE */
1542           case 0x2003:    /* EM SPACE */
1543           case 0x2004:    /* THREE-PER-EM SPACE */
1544           case 0x2005:    /* FOUR-PER-EM SPACE */
1545           case 0x2006:    /* SIX-PER-EM SPACE */
1546           case 0x2007:    /* FIGURE SPACE */
1547           case 0x2008:    /* PUNCTUATION SPACE */
1548           case 0x2009:    /* THIN SPACE */
1549           case 0x200A:    /* HAIR SPACE */
1550           case 0x202f:    /* NARROW NO-BREAK SPACE */
1551           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1552           case 0x3000:    /* IDEOGRAPHIC SPACE */
1553           OK = TRUE;
1554           break;
1555
1556           default:
1557           OK = FALSE;
1558           break;
1559           }
1560
1561         if (OK == (d == OP_HSPACE))
1562           {
1563           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1564               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1565             {
1566             active_count--;           /* Remove non-match possibility */
1567             next_active_state--;
1568             }
1569           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1570           }
1571         }
1572       break;
1573
1574       /*-----------------------------------------------------------------*/
1575 #ifdef SUPPORT_UCP
1576       case OP_PROP_EXTRA + OP_TYPEEXACT:
1577       case OP_PROP_EXTRA + OP_TYPEUPTO:
1578       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1579       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1580       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1581         { ADD_ACTIVE(state_offset + 6, 0); }
1582       count = current_state->count;  /* Number already matched */
1583       if (clen > 0)
1584         {
1585         BOOL OK;
1586         int chartype = UCD_CHARTYPE(c);
1587         switch(code[4])
1588           {
1589           case PT_ANY:
1590           OK = TRUE;
1591           break;
1592
1593           case PT_LAMP:
1594           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1595           break;
1596
1597           case PT_GC:
1598           OK = _pcre_ucp_gentype[chartype] == code[5];
1599           break;
1600
1601           case PT_PC:
1602           OK = chartype == code[5];
1603           break;
1604
1605           case PT_SC:
1606           OK = UCD_SCRIPT(c) == code[5];
1607           break;
1608
1609           /* Should never occur, but keep compilers from grumbling. */
1610
1611           default:
1612           OK = codevalue != OP_PROP;
1613           break;
1614           }
1615
1616         if (OK == (d == OP_PROP))
1617           {
1618           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1619             {
1620             active_count--;           /* Remove non-match possibility */
1621             next_active_state--;
1622             }
1623           if (++count >= GET2(code, 1))
1624             { ADD_NEW(state_offset + 6, 0); }
1625           else
1626             { ADD_NEW(state_offset, count); }
1627           }
1628         }
1629       break;
1630
1631       /*-----------------------------------------------------------------*/
1632       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1633       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1634       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1635       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1636       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1637         { ADD_ACTIVE(state_offset + 4, 0); }
1638       count = current_state->count;  /* Number already matched */
1639       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1640         {
1641         const uschar *nptr = ptr + clen;
1642         int ncount = 0;
1643         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1644           {
1645           active_count--;           /* Remove non-match possibility */
1646           next_active_state--;
1647           }
1648         while (nptr < end_subject)
1649           {
1650           int nd;
1651           int ndlen = 1;
1652           GETCHARLEN(nd, nptr, ndlen);
1653           if (UCD_CATEGORY(nd) != ucp_M) break;
1654           ncount++;
1655           nptr += ndlen;
1656           }
1657         if (++count >= GET2(code, 1))
1658           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1659         else
1660           { ADD_NEW_DATA(-state_offset, count, ncount); }
1661         }
1662       break;
1663 #endif
1664
1665       /*-----------------------------------------------------------------*/
1666       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1667       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1668       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1669       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1670       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1671         { ADD_ACTIVE(state_offset + 4, 0); }
1672       count = current_state->count;  /* Number already matched */
1673       if (clen > 0)
1674         {
1675         int ncount = 0;
1676         switch (c)
1677           {
1678           case 0x000b:
1679           case 0x000c:
1680           case 0x0085:
1681           case 0x2028:
1682           case 0x2029:
1683           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1684           goto ANYNL03;
1685
1686           case 0x000d:
1687           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1688           /* Fall through */
1689
1690           ANYNL03:
1691           case 0x000a:
1692           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1693             {
1694             active_count--;           /* Remove non-match possibility */
1695             next_active_state--;
1696             }
1697           if (++count >= GET2(code, 1))
1698             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1699           else
1700             { ADD_NEW_DATA(-state_offset, count, ncount); }
1701           break;
1702
1703           default:
1704           break;
1705           }
1706         }
1707       break;
1708
1709       /*-----------------------------------------------------------------*/
1710       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1711       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1712       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1713       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1714       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1715         { ADD_ACTIVE(state_offset + 4, 0); }
1716       count = current_state->count;  /* Number already matched */
1717       if (clen > 0)
1718         {
1719         BOOL OK;
1720         switch (c)
1721           {
1722           case 0x000a:
1723           case 0x000b:
1724           case 0x000c:
1725           case 0x000d:
1726           case 0x0085:
1727           case 0x2028:
1728           case 0x2029:
1729           OK = TRUE;
1730           break;
1731
1732           default:
1733           OK = FALSE;
1734           }
1735
1736         if (OK == (d == OP_VSPACE))
1737           {
1738           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1739             {
1740             active_count--;           /* Remove non-match possibility */
1741             next_active_state--;
1742             }
1743           if (++count >= GET2(code, 1))
1744             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1745           else
1746             { ADD_NEW_DATA(-state_offset, count, 0); }
1747           }
1748         }
1749       break;
1750
1751       /*-----------------------------------------------------------------*/
1752       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1753       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1754       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1755       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1756       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1757         { ADD_ACTIVE(state_offset + 4, 0); }
1758       count = current_state->count;  /* Number already matched */
1759       if (clen > 0)
1760         {
1761         BOOL OK;
1762         switch (c)
1763           {
1764           case 0x09:      /* HT */
1765           case 0x20:      /* SPACE */
1766           case 0xa0:      /* NBSP */
1767           case 0x1680:    /* OGHAM SPACE MARK */
1768           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1769           case 0x2000:    /* EN QUAD */
1770           case 0x2001:    /* EM QUAD */
1771           case 0x2002:    /* EN SPACE */
1772           case 0x2003:    /* EM SPACE */
1773           case 0x2004:    /* THREE-PER-EM SPACE */
1774           case 0x2005:    /* FOUR-PER-EM SPACE */
1775           case 0x2006:    /* SIX-PER-EM SPACE */
1776           case 0x2007:    /* FIGURE SPACE */
1777           case 0x2008:    /* PUNCTUATION SPACE */
1778           case 0x2009:    /* THIN SPACE */
1779           case 0x200A:    /* HAIR SPACE */
1780           case 0x202f:    /* NARROW NO-BREAK SPACE */
1781           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1782           case 0x3000:    /* IDEOGRAPHIC SPACE */
1783           OK = TRUE;
1784           break;
1785
1786           default:
1787           OK = FALSE;
1788           break;
1789           }
1790
1791         if (OK == (d == OP_HSPACE))
1792           {
1793           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1794             {
1795             active_count--;           /* Remove non-match possibility */
1796             next_active_state--;
1797             }
1798           if (++count >= GET2(code, 1))
1799             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1800           else
1801             { ADD_NEW_DATA(-state_offset, count, 0); }
1802           }
1803         }
1804       break;
1805
1806 /* ========================================================================== */
1807       /* These opcodes are followed by a character that is usually compared
1808       to the current subject character; it is loaded into d. We still get
1809       here even if there is no subject character, because in some cases zero
1810       repetitions are permitted. */
1811
1812       /*-----------------------------------------------------------------*/
1813       case OP_CHAR:
1814       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1815       break;
1816
1817       /*-----------------------------------------------------------------*/
1818       case OP_CHARNC:
1819       if (clen == 0) break;
1820
1821 #ifdef SUPPORT_UTF8
1822       if (utf8)
1823         {
1824         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1825           {
1826           unsigned int othercase;
1827           if (c < 128) othercase = fcc[c]; else
1828
1829           /* If we have Unicode property support, we can use it to test the
1830           other case of the character. */
1831
1832 #ifdef SUPPORT_UCP
1833           othercase = UCD_OTHERCASE(c);
1834 #else
1835           othercase = NOTACHAR;
1836 #endif
1837
1838           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1839           }
1840         }
1841       else
1842 #endif  /* SUPPORT_UTF8 */
1843
1844       /* Non-UTF-8 mode */
1845         {
1846         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1847         }
1848       break;
1849
1850
1851 #ifdef SUPPORT_UCP
1852       /*-----------------------------------------------------------------*/
1853       /* This is a tricky one because it can match more than one character.
1854       Find out how many characters to skip, and then set up a negative state
1855       to wait for them to pass before continuing. */
1856
1857       case OP_EXTUNI:
1858       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1859         {
1860         const uschar *nptr = ptr + clen;
1861         int ncount = 0;
1862         while (nptr < end_subject)
1863           {
1864           int nclen = 1;
1865           GETCHARLEN(c, nptr, nclen);
1866           if (UCD_CATEGORY(c) != ucp_M) break;
1867           ncount++;
1868           nptr += nclen;
1869           }
1870         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1871         }
1872       break;
1873 #endif
1874
1875       /*-----------------------------------------------------------------*/
1876       /* This is a tricky one like EXTUNI because it too can match more than
1877       one character (when CR is followed by LF). In this case, set up a
1878       negative state to wait for one character to pass before continuing. */
1879
1880       case OP_ANYNL:
1881       if (clen > 0) switch(c)
1882         {
1883         case 0x000b:
1884         case 0x000c:
1885         case 0x0085:
1886         case 0x2028:
1887         case 0x2029:
1888         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1889
1890         case 0x000a:
1891         ADD_NEW(state_offset + 1, 0);
1892         break;
1893
1894         case 0x000d:
1895         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1896           {
1897           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1898           }
1899         else
1900           {
1901           ADD_NEW(state_offset + 1, 0);
1902           }
1903         break;
1904         }
1905       break;
1906
1907       /*-----------------------------------------------------------------*/
1908       case OP_NOT_VSPACE:
1909       if (clen > 0) switch(c)
1910         {
1911         case 0x000a:
1912         case 0x000b:
1913         case 0x000c:
1914         case 0x000d:
1915         case 0x0085:
1916         case 0x2028:
1917         case 0x2029:
1918         break;
1919
1920         default:
1921         ADD_NEW(state_offset + 1, 0);
1922         break;
1923         }
1924       break;
1925
1926       /*-----------------------------------------------------------------*/
1927       case OP_VSPACE:
1928       if (clen > 0) switch(c)
1929         {
1930         case 0x000a:
1931         case 0x000b:
1932         case 0x000c:
1933         case 0x000d:
1934         case 0x0085:
1935         case 0x2028:
1936         case 0x2029:
1937         ADD_NEW(state_offset + 1, 0);
1938         break;
1939
1940         default: break;
1941         }
1942       break;
1943
1944       /*-----------------------------------------------------------------*/
1945       case OP_NOT_HSPACE:
1946       if (clen > 0) switch(c)
1947         {
1948         case 0x09:      /* HT */
1949         case 0x20:      /* SPACE */
1950         case 0xa0:      /* NBSP */
1951         case 0x1680:    /* OGHAM SPACE MARK */
1952         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1953         case 0x2000:    /* EN QUAD */
1954         case 0x2001:    /* EM QUAD */
1955         case 0x2002:    /* EN SPACE */
1956         case 0x2003:    /* EM SPACE */
1957         case 0x2004:    /* THREE-PER-EM SPACE */
1958         case 0x2005:    /* FOUR-PER-EM SPACE */
1959         case 0x2006:    /* SIX-PER-EM SPACE */
1960         case 0x2007:    /* FIGURE SPACE */
1961         case 0x2008:    /* PUNCTUATION SPACE */
1962         case 0x2009:    /* THIN SPACE */
1963         case 0x200A:    /* HAIR SPACE */
1964         case 0x202f:    /* NARROW NO-BREAK SPACE */
1965         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1966         case 0x3000:    /* IDEOGRAPHIC SPACE */
1967         break;
1968
1969         default:
1970         ADD_NEW(state_offset + 1, 0);
1971         break;
1972         }
1973       break;
1974
1975       /*-----------------------------------------------------------------*/
1976       case OP_HSPACE:
1977       if (clen > 0) switch(c)
1978         {
1979         case 0x09:      /* HT */
1980         case 0x20:      /* SPACE */
1981         case 0xa0:      /* NBSP */
1982         case 0x1680:    /* OGHAM SPACE MARK */
1983         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1984         case 0x2000:    /* EN QUAD */
1985         case 0x2001:    /* EM QUAD */
1986         case 0x2002:    /* EN SPACE */
1987         case 0x2003:    /* EM SPACE */
1988         case 0x2004:    /* THREE-PER-EM SPACE */
1989         case 0x2005:    /* FOUR-PER-EM SPACE */
1990         case 0x2006:    /* SIX-PER-EM SPACE */
1991         case 0x2007:    /* FIGURE SPACE */
1992         case 0x2008:    /* PUNCTUATION SPACE */
1993         case 0x2009:    /* THIN SPACE */
1994         case 0x200A:    /* HAIR SPACE */
1995         case 0x202f:    /* NARROW NO-BREAK SPACE */
1996         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1997         case 0x3000:    /* IDEOGRAPHIC SPACE */
1998         ADD_NEW(state_offset + 1, 0);
1999         break;
2000         }
2001       break;
2002
2003       /*-----------------------------------------------------------------*/
2004       /* Match a negated single character. This is only used for one-byte
2005       characters, that is, we know that d < 256. The character we are
2006       checking (c) can be multibyte. */
2007
2008       case OP_NOT:
2009       if (clen > 0)
2010         {
2011         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2012         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2013         }
2014       break;
2015
2016       /*-----------------------------------------------------------------*/
2017       case OP_PLUS:
2018       case OP_MINPLUS:
2019       case OP_POSPLUS:
2020       case OP_NOTPLUS:
2021       case OP_NOTMINPLUS:
2022       case OP_NOTPOSPLUS:
2023       count = current_state->count;  /* Already matched */
2024       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2025       if (clen > 0)
2026         {
2027         unsigned int otherd = NOTACHAR;
2028         if ((ims & PCRE_CASELESS) != 0)
2029           {
2030 #ifdef SUPPORT_UTF8
2031           if (utf8 && d >= 128)
2032             {
2033 #ifdef SUPPORT_UCP
2034             otherd = UCD_OTHERCASE(d);
2035 #endif  /* SUPPORT_UCP */
2036             }
2037           else
2038 #endif  /* SUPPORT_UTF8 */
2039           otherd = fcc[d];
2040           }
2041         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042           {
2043           if (count > 0 &&
2044               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2045             {
2046             active_count--;             /* Remove non-match possibility */
2047             next_active_state--;
2048             }
2049           count++;
2050           ADD_NEW(state_offset, count);
2051           }
2052         }
2053       break;
2054
2055       /*-----------------------------------------------------------------*/
2056       case OP_QUERY:
2057       case OP_MINQUERY:
2058       case OP_POSQUERY:
2059       case OP_NOTQUERY:
2060       case OP_NOTMINQUERY:
2061       case OP_NOTPOSQUERY:
2062       ADD_ACTIVE(state_offset + dlen + 1, 0);
2063       if (clen > 0)
2064         {
2065         unsigned int otherd = NOTACHAR;
2066         if ((ims & PCRE_CASELESS) != 0)
2067           {
2068 #ifdef SUPPORT_UTF8
2069           if (utf8 && d >= 128)
2070             {
2071 #ifdef SUPPORT_UCP
2072             otherd = UCD_OTHERCASE(d);
2073 #endif  /* SUPPORT_UCP */
2074             }
2075           else
2076 #endif  /* SUPPORT_UTF8 */
2077           otherd = fcc[d];
2078           }
2079         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2080           {
2081           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2082             {
2083             active_count--;            /* Remove non-match possibility */
2084             next_active_state--;
2085             }
2086           ADD_NEW(state_offset + dlen + 1, 0);
2087           }
2088         }
2089       break;
2090
2091       /*-----------------------------------------------------------------*/
2092       case OP_STAR:
2093       case OP_MINSTAR:
2094       case OP_POSSTAR:
2095       case OP_NOTSTAR:
2096       case OP_NOTMINSTAR:
2097       case OP_NOTPOSSTAR:
2098       ADD_ACTIVE(state_offset + dlen + 1, 0);
2099       if (clen > 0)
2100         {
2101         unsigned int otherd = NOTACHAR;
2102         if ((ims & PCRE_CASELESS) != 0)
2103           {
2104 #ifdef SUPPORT_UTF8
2105           if (utf8 && d >= 128)
2106             {
2107 #ifdef SUPPORT_UCP
2108             otherd = UCD_OTHERCASE(d);
2109 #endif  /* SUPPORT_UCP */
2110             }
2111           else
2112 #endif  /* SUPPORT_UTF8 */
2113           otherd = fcc[d];
2114           }
2115         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2116           {
2117           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2118             {
2119             active_count--;            /* Remove non-match possibility */
2120             next_active_state--;
2121             }
2122           ADD_NEW(state_offset, 0);
2123           }
2124         }
2125       break;
2126
2127       /*-----------------------------------------------------------------*/
2128       case OP_EXACT:
2129       case OP_NOTEXACT:
2130       count = current_state->count;  /* Number already matched */
2131       if (clen > 0)
2132         {
2133         unsigned int otherd = NOTACHAR;
2134         if ((ims & PCRE_CASELESS) != 0)
2135           {
2136 #ifdef SUPPORT_UTF8
2137           if (utf8 && d >= 128)
2138             {
2139 #ifdef SUPPORT_UCP
2140             otherd = UCD_OTHERCASE(d);
2141 #endif  /* SUPPORT_UCP */
2142             }
2143           else
2144 #endif  /* SUPPORT_UTF8 */
2145           otherd = fcc[d];
2146           }
2147         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2148           {
2149           if (++count >= GET2(code, 1))
2150             { ADD_NEW(state_offset + dlen + 3, 0); }
2151           else
2152             { ADD_NEW(state_offset, count); }
2153           }
2154         }
2155       break;
2156
2157       /*-----------------------------------------------------------------*/
2158       case OP_UPTO:
2159       case OP_MINUPTO:
2160       case OP_POSUPTO:
2161       case OP_NOTUPTO:
2162       case OP_NOTMINUPTO:
2163       case OP_NOTPOSUPTO:
2164       ADD_ACTIVE(state_offset + dlen + 3, 0);
2165       count = current_state->count;  /* Number already matched */
2166       if (clen > 0)
2167         {
2168         unsigned int otherd = NOTACHAR;
2169         if ((ims & PCRE_CASELESS) != 0)
2170           {
2171 #ifdef SUPPORT_UTF8
2172           if (utf8 && d >= 128)
2173             {
2174 #ifdef SUPPORT_UCP
2175             otherd = UCD_OTHERCASE(d);
2176 #endif  /* SUPPORT_UCP */
2177             }
2178           else
2179 #endif  /* SUPPORT_UTF8 */
2180           otherd = fcc[d];
2181           }
2182         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2183           {
2184           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2185             {
2186             active_count--;             /* Remove non-match possibility */
2187             next_active_state--;
2188             }
2189           if (++count >= GET2(code, 1))
2190             { ADD_NEW(state_offset + dlen + 3, 0); }
2191           else
2192             { ADD_NEW(state_offset, count); }
2193           }
2194         }
2195       break;
2196
2197
2198 /* ========================================================================== */
2199       /* These are the class-handling opcodes */
2200
2201       case OP_CLASS:
2202       case OP_NCLASS:
2203       case OP_XCLASS:
2204         {
2205         BOOL isinclass = FALSE;
2206         int next_state_offset;
2207         const uschar *ecode;
2208
2209         /* For a simple class, there is always just a 32-byte table, and we
2210         can set isinclass from it. */
2211
2212         if (codevalue != OP_XCLASS)
2213           {
2214           ecode = code + 33;
2215           if (clen > 0)
2216             {
2217             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2218               ((code[1 + c/8] & (1 << (c&7))) != 0);
2219             }
2220           }
2221
2222         /* An extended class may have a table or a list of single characters,
2223         ranges, or both, and it may be positive or negative. There's a
2224         function that sorts all this out. */
2225
2226         else
2227          {
2228          ecode = code + GET(code, 1);
2229          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2230          }
2231
2232         /* At this point, isinclass is set for all kinds of class, and ecode
2233         points to the byte after the end of the class. If there is a
2234         quantifier, this is where it will be. */
2235
2236         next_state_offset = ecode - start_code;
2237
2238         switch (*ecode)
2239           {
2240           case OP_CRSTAR:
2241           case OP_CRMINSTAR:
2242           ADD_ACTIVE(next_state_offset + 1, 0);
2243           if (isinclass) { ADD_NEW(state_offset, 0); }
2244           break;
2245
2246           case OP_CRPLUS:
2247           case OP_CRMINPLUS:
2248           count = current_state->count;  /* Already matched */
2249           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2250           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2251           break;
2252
2253           case OP_CRQUERY:
2254           case OP_CRMINQUERY:
2255           ADD_ACTIVE(next_state_offset + 1, 0);
2256           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2257           break;
2258
2259           case OP_CRRANGE:
2260           case OP_CRMINRANGE:
2261           count = current_state->count;  /* Already matched */
2262           if (count >= GET2(ecode, 1))
2263             { ADD_ACTIVE(next_state_offset + 5, 0); }
2264           if (isinclass)
2265             {
2266             int max = GET2(ecode, 3);
2267             if (++count >= max && max != 0)   /* Max 0 => no limit */
2268               { ADD_NEW(next_state_offset + 5, 0); }
2269             else
2270               { ADD_NEW(state_offset, count); }
2271             }
2272           break;
2273
2274           default:
2275           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2276           break;
2277           }
2278         }
2279       break;
2280
2281 /* ========================================================================== */
2282       /* These are the opcodes for fancy brackets of various kinds. We have
2283       to use recursion in order to handle them. The "always failing" assertion
2284       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2285       though the other "backtracking verbs" are not supported. */
2286
2287       case OP_FAIL:
2288       forced_fail++;    /* Count FAILs for multiple states */
2289       break;
2290
2291       case OP_ASSERT:
2292       case OP_ASSERT_NOT:
2293       case OP_ASSERTBACK:
2294       case OP_ASSERTBACK_NOT:
2295         {
2296         int rc;
2297         int local_offsets[2];
2298         int local_workspace[1000];
2299         const uschar *endasscode = code + GET(code, 1);
2300
2301         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2302
2303         rc = internal_dfa_exec(
2304           md,                                   /* static match data */
2305           code,                                 /* this subexpression's code */
2306           ptr,                                  /* where we currently are */
2307           ptr - start_subject,                  /* start offset */
2308           local_offsets,                        /* offset vector */
2309           sizeof(local_offsets)/sizeof(int),    /* size of same */
2310           local_workspace,                      /* workspace vector */
2311           sizeof(local_workspace)/sizeof(int),  /* size of same */
2312           ims,                                  /* the current ims flags */
2313           rlevel,                               /* function recursion level */
2314           recursing);                           /* pass on regex recursion */
2315
2316         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2317         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2318             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2319         }
2320       break;
2321
2322       /*-----------------------------------------------------------------*/
2323       case OP_COND:
2324       case OP_SCOND:
2325         {
2326         int local_offsets[1000];
2327         int local_workspace[1000];
2328         int codelink = GET(code, 1);
2329         int condcode;
2330
2331         /* Because of the way auto-callout works during compile, a callout item
2332         is inserted between OP_COND and an assertion condition. This does not
2333         happen for the other conditions. */
2334
2335         if (code[LINK_SIZE+1] == OP_CALLOUT)
2336           {
2337           rrc = 0;
2338           if (pcre_callout != NULL)
2339             {
2340             pcre_callout_block cb;
2341             cb.version          = 1;   /* Version 1 of the callout block */
2342             cb.callout_number   = code[LINK_SIZE+2];
2343             cb.offset_vector    = offsets;
2344             cb.subject          = (PCRE_SPTR)start_subject;
2345             cb.subject_length   = end_subject - start_subject;
2346             cb.start_match      = current_subject - start_subject;
2347             cb.current_position = ptr - start_subject;
2348             cb.pattern_position = GET(code, LINK_SIZE + 3);
2349             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2350             cb.capture_top      = 1;
2351             cb.capture_last     = -1;
2352             cb.callout_data     = md->callout_data;
2353             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2354             }
2355           if (rrc > 0) break;                      /* Fail this thread */
2356           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2357           }
2358
2359         condcode = code[LINK_SIZE+1];
2360
2361         /* Back reference conditions are not supported */
2362
2363         if (condcode == OP_CREF || condcode == OP_NCREF)
2364           return PCRE_ERROR_DFA_UCOND;
2365
2366         /* The DEFINE condition is always false */
2367
2368         if (condcode == OP_DEF)
2369           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2370
2371         /* The only supported version of OP_RREF is for the value RREF_ANY,
2372         which means "test if in any recursion". We can't test for specifically
2373         recursed groups. */
2374
2375         else if (condcode == OP_RREF || condcode == OP_NRREF)
2376           {
2377           int value = GET2(code, LINK_SIZE+2);
2378           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2379           if (recursing > 0)
2380             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2381           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2382           }
2383
2384         /* Otherwise, the condition is an assertion */
2385
2386         else
2387           {
2388           int rc;
2389           const uschar *asscode = code + LINK_SIZE + 1;
2390           const uschar *endasscode = asscode + GET(asscode, 1);
2391
2392           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2393
2394           rc = internal_dfa_exec(
2395             md,                                   /* fixed match data */
2396             asscode,                              /* this subexpression's code */
2397             ptr,                                  /* where we currently are */
2398             ptr - start_subject,                  /* start offset */
2399             local_offsets,                        /* offset vector */
2400             sizeof(local_offsets)/sizeof(int),    /* size of same */
2401             local_workspace,                      /* workspace vector */
2402             sizeof(local_workspace)/sizeof(int),  /* size of same */
2403             ims,                                  /* the current ims flags */
2404             rlevel,                               /* function recursion level */
2405             recursing);                           /* pass on regex recursion */
2406
2407           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2408           if ((rc >= 0) ==
2409                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2410             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2411           else
2412             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2413           }
2414         }
2415       break;
2416
2417       /*-----------------------------------------------------------------*/
2418       case OP_RECURSE:
2419         {
2420         int local_offsets[1000];
2421         int local_workspace[1000];
2422         int rc;
2423
2424         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2425           recursing + 1));
2426
2427         rc = internal_dfa_exec(
2428           md,                                   /* fixed match data */
2429           start_code + GET(code, 1),            /* this subexpression's code */
2430           ptr,                                  /* where we currently are */
2431           ptr - start_subject,                  /* start offset */
2432           local_offsets,                        /* offset vector */
2433           sizeof(local_offsets)/sizeof(int),    /* size of same */
2434           local_workspace,                      /* workspace vector */
2435           sizeof(local_workspace)/sizeof(int),  /* size of same */
2436           ims,                                  /* the current ims flags */
2437           rlevel,                               /* function recursion level */
2438           recursing + 1);                       /* regex recurse level */
2439
2440         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2441           recursing + 1, rc));
2442
2443         /* Ran out of internal offsets */
2444
2445         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2446
2447         /* For each successful matched substring, set up the next state with a
2448         count of characters to skip before trying it. Note that the count is in
2449         characters, not bytes. */
2450
2451         if (rc > 0)
2452           {
2453           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2454             {
2455             const uschar *p = start_subject + local_offsets[rc];
2456             const uschar *pp = start_subject + local_offsets[rc+1];
2457             int charcount = local_offsets[rc+1] - local_offsets[rc];
2458             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2459             if (charcount > 0)
2460               {
2461               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2462               }
2463             else
2464               {
2465               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2466               }
2467             }
2468           }
2469         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2470         }
2471       break;
2472
2473       /*-----------------------------------------------------------------*/
2474       case OP_ONCE:
2475         {
2476         int local_offsets[2];
2477         int local_workspace[1000];
2478
2479         int rc = internal_dfa_exec(
2480           md,                                   /* fixed match data */
2481           code,                                 /* this subexpression's code */
2482           ptr,                                  /* where we currently are */
2483           ptr - start_subject,                  /* start offset */
2484           local_offsets,                        /* offset vector */
2485           sizeof(local_offsets)/sizeof(int),    /* size of same */
2486           local_workspace,                      /* workspace vector */
2487           sizeof(local_workspace)/sizeof(int),  /* size of same */
2488           ims,                                  /* the current ims flags */
2489           rlevel,                               /* function recursion level */
2490           recursing);                           /* pass on regex recursion */
2491
2492         if (rc >= 0)
2493           {
2494           const uschar *end_subpattern = code;
2495           int charcount = local_offsets[1] - local_offsets[0];
2496           int next_state_offset, repeat_state_offset;
2497
2498           do { end_subpattern += GET(end_subpattern, 1); }
2499             while (*end_subpattern == OP_ALT);
2500           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2501
2502           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2503           arrange for the repeat state also to be added to the relevant list.
2504           Calculate the offset, or set -1 for no repeat. */
2505
2506           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2507                                  *end_subpattern == OP_KETRMIN)?
2508             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2509
2510           /* If we have matched an empty string, add the next state at the
2511           current character pointer. This is important so that the duplicate
2512           checking kicks in, which is what breaks infinite loops that match an
2513           empty string. */
2514
2515           if (charcount == 0)
2516             {
2517             ADD_ACTIVE(next_state_offset, 0);
2518             }
2519
2520           /* Optimization: if there are no more active states, and there
2521           are no new states yet set up, then skip over the subject string
2522           right here, to save looping. Otherwise, set up the new state to swing
2523           into action when the end of the substring is reached. */
2524
2525           else if (i + 1 >= active_count && new_count == 0)
2526             {
2527             ptr += charcount;
2528             clen = 0;
2529             ADD_NEW(next_state_offset, 0);
2530
2531             /* If we are adding a repeat state at the new character position,
2532             we must fudge things so that it is the only current state.
2533             Otherwise, it might be a duplicate of one we processed before, and
2534             that would cause it to be skipped. */
2535
2536             if (repeat_state_offset >= 0)
2537               {
2538               next_active_state = active_states;
2539               active_count = 0;
2540               i = -1;
2541               ADD_ACTIVE(repeat_state_offset, 0);
2542               }
2543             }
2544           else
2545             {
2546             const uschar *p = start_subject + local_offsets[0];
2547             const uschar *pp = start_subject + local_offsets[1];
2548             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2549             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2550             if (repeat_state_offset >= 0)
2551               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2552             }
2553
2554           }
2555         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2556         }
2557       break;
2558
2559
2560 /* ========================================================================== */
2561       /* Handle callouts */
2562
2563       case OP_CALLOUT:
2564       rrc = 0;
2565       if (pcre_callout != NULL)
2566         {
2567         pcre_callout_block cb;
2568         cb.version          = 1;   /* Version 1 of the callout block */
2569         cb.callout_number   = code[1];
2570         cb.offset_vector    = offsets;
2571         cb.subject          = (PCRE_SPTR)start_subject;
2572         cb.subject_length   = end_subject - start_subject;
2573         cb.start_match      = current_subject - start_subject;
2574         cb.current_position = ptr - start_subject;
2575         cb.pattern_position = GET(code, 2);
2576         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2577         cb.capture_top      = 1;
2578         cb.capture_last     = -1;
2579         cb.callout_data     = md->callout_data;
2580         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2581         }
2582       if (rrc == 0)
2583         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2584       break;
2585
2586
2587 /* ========================================================================== */
2588       default:        /* Unsupported opcode */
2589       return PCRE_ERROR_DFA_UITEM;
2590       }
2591
2592     NEXT_ACTIVE_STATE: continue;
2593
2594     }      /* End of loop scanning active states */
2595
2596   /* We have finished the processing at the current subject character. If no
2597   new states have been set for the next character, we have found all the
2598   matches that we are going to find. If we are at the top level and partial
2599   matching has been requested, check for appropriate conditions.
2600
2601   The "forced_fail" variable counts the number of (*F) encountered for the
2602   character. If it is equal to the original active_count (saved in
2603   workspace[1]) it means that (*F) was found on every active state. In this
2604   case we don't want to give a partial match.
2605
2606   The "could_continue" variable is true if a state could have continued but
2607   for the fact that the end of the subject was reached. */
2608
2609   if (new_count <= 0)
2610     {
2611     if (rlevel == 1 &&                               /* Top level, and */
2612         could_continue &&                            /* Some could go on */
2613         forced_fail != workspace[1] &&               /* Not all forced fail & */
2614         (                                            /* either... */
2615         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2616         ||                                           /* or... */
2617         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2618          match_count < 0)                            /* no matches */
2619         ) &&                                         /* And... */
2620         ptr >= end_subject &&                     /* Reached end of subject */
2621         ptr > current_subject)                    /* Matched non-empty string */
2622       {
2623       if (offsetcount >= 2)
2624         {
2625         offsets[0] = md->start_used_ptr - start_subject;
2626         offsets[1] = end_subject - start_subject;
2627         }
2628       match_count = PCRE_ERROR_PARTIAL;
2629       }
2630
2631     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2632       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2633       rlevel*2-2, SP));
2634     break;        /* In effect, "return", but see the comment below */
2635     }
2636
2637   /* One or more states are active for the next character. */
2638
2639   ptr += clen;    /* Advance to next subject character */
2640   }               /* Loop to move along the subject string */
2641
2642 /* Control gets here from "break" a few lines above. We do it this way because
2643 if we use "return" above, we have compiler trouble. Some compilers warn if
2644 there's nothing here because they think the function doesn't return a value. On
2645 the other hand, if we put a dummy statement here, some more clever compilers
2646 complain that it can't be reached. Sigh. */
2647
2648 return match_count;
2649 }
2650
2651
2652
2653
2654 /*************************************************
2655 *    Execute a Regular Expression - DFA engine   *
2656 *************************************************/
2657
2658 /* This external function applies a compiled re to a subject string using a DFA
2659 engine. This function calls the internal function multiple times if the pattern
2660 is not anchored.
     After validating its arguments it fills in the fixed match data block and
     then runs a bumpalong loop: each pass optionally skips forward to a plausible
     starting point, calls internal_dfa_exec() there, and, if that reports no
     match and the match is not anchored, moves the start position on and tries
     again.
2661
2662 Arguments:
2663   argument_re     points to the compiled expression
2664   extra_data      points to extra data or is NULL
2665   subject         points to the subject string
2666   length          length of subject string (may contain binary zeros)
2667   start_offset    where to start in the subject string
2668   options         option bits
2669   offsets         vector of match offsets (may be NULL only if offsetcount is 0)
2670   offsetcount     size of same (must not be negative)
2671   workspace       workspace vector
2672   wscount         size of same (must be at least 20)
2673
2674 Returns:          > 0 => number of match offset pairs placed in offsets
2675                   = 0 => offsets overflowed; longest matches are present
2676                    -1 => failed to match
2677                  < -1 => some kind of unexpected problem
     
     The negative values returned directly by this function, as opposed to being
     passed back unchanged from internal_dfa_exec(), are PCRE_ERROR_BADOPTION,
     PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_DFA_WSSIZE,
     PCRE_ERROR_DFA_UMLIMIT, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE,
     PCRE_ERROR_NOMATCH and, when built with SUPPORT_UTF8, PCRE_ERROR_BADUTF8 and
     PCRE_ERROR_BADUTF8_OFFSET.
2678 */
2679
2680 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2681 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2682   const char *subject, int length, int start_offset, int options, int *offsets,
2683   int offsetcount, int *workspace, int wscount)
2684 {
2685 real_pcre *re = (real_pcre *)argument_re;
2686 dfa_match_data match_block;
2687 dfa_match_data *md = &match_block;
2688 BOOL utf8, anchored, startline, firstline;
2689 const uschar *current_subject, *end_subject, *lcc;
2690
2691 pcre_study_data internal_study;
2692 const pcre_study_data *study = NULL;
2693 real_pcre internal_re;
2694
2695 const uschar *req_byte_ptr;
2696 const uschar *start_bits = NULL;
2697 BOOL first_byte_caseless = FALSE;
2698 BOOL req_byte_caseless = FALSE;
2699 int first_byte = -1;    /* negative => no known first byte */
2700 int req_byte = -1;      /* negative => no known required byte */
2701 int req_byte2 = -1;
2702 int newline;
2703
2704 /* Plausibility checks */
2705
2706 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2707 if (re == NULL || subject == NULL || workspace == NULL ||
2708    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2709 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2710 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2711
2712 /* We need to find the pointer to any study data before we test for byte
2713 flipping, so we scan the extra_data block first. This may set two fields in the
2714 match block, so we must initialize them beforehand. However, the other fields
2715 in the match block must not be set until after the byte flipping. */
2716
2717 md->tables = re->tables;
2718 md->callout_data = NULL;
2719
2720 if (extra_data != NULL)
2721   {
2722   unsigned int flags = extra_data->flags;
2723   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2724     study = (const pcre_study_data *)extra_data->study_data;
       /* Either match limit flag causes an immediate PCRE_ERROR_DFA_UMLIMIT
       return. */
2725   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2726   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2727     return PCRE_ERROR_DFA_UMLIMIT;
2728   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2729     md->callout_data = extra_data->callout_data;
       /* Tables supplied in the extra block override those taken from re. */
2730   if ((flags & PCRE_EXTRA_TABLES) != 0)
2731     md->tables = extra_data->tables;
2732   }
2733
2734 /* Check that the first field in the block is the magic number. If it is not,
2735 test for a regex that was compiled on a host of opposite endianness. If this is
2736 the case, flipped values are put in internal_re and internal_study if there was
2737 study data too. */
2738
2739 if (re->magic_number != MAGIC_NUMBER)
2740   {
2741   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2742   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2743   if (study != NULL) study = &internal_study;
2744   }
2745
2746 /* Set some local values. req_byte_ptr is started one byte before the first
     match position so that the first "p > req_byte_ptr" test in the required
     byte search further down is true. */
2747
2748 current_subject = (const unsigned char *)subject + start_offset;
2749 end_subject = (const unsigned char *)subject + length;
2750 req_byte_ptr = current_subject - 1;
2751
2752 #ifdef SUPPORT_UTF8
2753 utf8 = (re->options & PCRE_UTF8) != 0;
2754 #else
2755 utf8 = FALSE;
2756 #endif
2757
     /* Anchoring can be requested by the match options or by the compiled
     pattern. A PCRE_DFA_RESTART call is also treated as anchored, so the loop
     below returns after a single attempt in that case. */
     
2758 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2759   (re->options & PCRE_ANCHORED) != 0;
2760
2761 /* The remaining fixed data for passing around. The address of the compiled
     code is argument_re plus offsets read from re; note that re itself may have
     been replaced by the value returned from _pcre_try_flipped() above. */
2762
2763 md->start_code = (const uschar *)argument_re +
2764     re->name_table_offset + re->name_count * re->name_entry_size;
2765 md->start_subject = (const unsigned char *)subject;
2766 md->end_subject = end_subject;
2767 md->start_offset = start_offset;
2768 md->moptions = options;
2769 md->poptions = re->options;
2770
2771 /* If the BSR option is not set at match time, copy what was set
2772 at compile time. */
2773
2774 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2775   {
2776   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2777     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2778 #ifdef BSR_ANYCRLF
2779   else md->moptions |= PCRE_BSR_ANYCRLF;
2780 #endif
2781   }
2782
2783 /* Handle different types of newline. The three bits give eight cases. If
2784 nothing is set at run time, whatever was used at compile time applies.
     The resulting value of newline is a single character, two characters packed
     into one int as (first << 8) | second, -1 for PCRE_NEWLINE_ANY, or -2 for
     PCRE_NEWLINE_ANYCRLF. Any other bit combination is rejected. */
2785
2786 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2787          PCRE_NEWLINE_BITS)
2788   {
2789   case 0: newline = NEWLINE; break;   /* Compile-time default */
2790   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2791   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2792   case PCRE_NEWLINE_CR+
2793        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2794   case PCRE_NEWLINE_ANY: newline = -1; break;
2795   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2796   default: return PCRE_ERROR_BADNEWLINE;
2797   }
2798
     /* Translate the newline value into the nltype, nllen and nl fields of the
     match block. nllen and nl are set only for the fixed case. */
     
2799 if (newline == -2)
2800   {
2801   md->nltype = NLTYPE_ANYCRLF;
2802   }
2803 else if (newline < 0)
2804   {
2805   md->nltype = NLTYPE_ANY;
2806   }
2807 else
2808   {
2809   md->nltype = NLTYPE_FIXED;
2810   if (newline > 255)     /* two packed characters */
2811     {
2812     md->nllen = 2;
2813     md->nl[0] = (newline >> 8) & 255;
2814     md->nl[1] = newline & 255;
2815     }
2816   else
2817     {
2818     md->nllen = 1;
2819     md->nl[0] = newline;
2820     }
2821   }
2822
2823 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2824 back the character offset. A non-negative result from _pcre_valid_utf8() is
     treated as a failure. In addition, if start_offset lies strictly inside the
     subject, the byte at that offset must not have its top two bits equal to 10,
     that is, it must not be a UTF-8 continuation byte. */
2825
2826 #ifdef SUPPORT_UTF8
2827 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2828   {
2829   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2830     return PCRE_ERROR_BADUTF8;
2831   if (start_offset > 0 && start_offset < length)
2832     {
2833     int tb = ((uschar *)subject)[start_offset];
2834     if (tb > 127)
2835       {
2836       tb &= 0xc0;
2837       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2838       }
2839     }
2840   }
2841 #endif
2842
2843 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2844 is a feature that makes it possible to save compiled regex and re-use them
2845 in other programs later. */
2846
2847 if (md->tables == NULL) md->tables = _pcre_default_tables;
2848
2849 /* The lower casing table and the "must be at the start of a line" flag are
2850 used in a loop when finding where to start. */
2851
2852 lcc = md->tables + lcc_offset;
2853 startline = (re->flags & PCRE_STARTLINE) != 0;
2854 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2855
2856 /* Set up the first character to match, if available. The first_byte value is
2857 never set for an anchored regular expression, but the anchoring may be forced
2858 at run time, so we have to test for anchoring. The first char may be unset for
2859 an unanchored pattern, of course. If there's no first char and the pattern was
2860 studied, there may be a bitmap of possible first characters. */
2861
2862 if (!anchored)
2863   {
2864   if ((re->flags & PCRE_FIRSTSET) != 0)
2865     {
2866     first_byte = re->first_byte & 255;
         /* For a caseless first byte, keep its lcc[] mapping so that the scan
         below can compare it with lcc[*current_subject]. */
2867     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2868       first_byte = lcc[first_byte];
2869     }
2870   else
2871     {
2872     if (!startline && study != NULL &&
2873          (study->flags & PCRE_STUDY_MAPPED) != 0)
2874       start_bits = study->start_bits;
2875     }
2876   }
2877
2878 /* For anchored or unanchored matches, there may be a "last known required
2879 character" set. */
2880
2881 if ((re->flags & PCRE_REQCHSET) != 0)
2882   {
2883   req_byte = re->req_byte & 255;
2884   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2885   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2886   }
2887
2888 /* Call the main matching function, looping for a non-anchored regex after a
2889 failed match. If not restarting, perform certain optimizations at the start of
2890 a match. */
2891
2892 for (;;)
2893   {
2894   int rc;
2895
2896   if ((options & PCRE_DFA_RESTART) == 0)
2897     {
2898     const uschar *save_end_subject = end_subject;
2899
2900     /* If firstline is TRUE, the start of the match is constrained to the first
2901     line of a multiline string. Implement this by temporarily adjusting
2902     end_subject so that we stop scanning at a newline. If the match fails at
2903     the newline, later code breaks this loop. */
2904
2905     if (firstline)
2906       {
2907       USPTR t = current_subject;
2908 #ifdef SUPPORT_UTF8
2909       if (utf8)
2910         {
2911         while (t < md->end_subject && !IS_NEWLINE(t))
2912           {
2913           t++;
2914           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2915           }
2916         }
2917       else
2918 #endif
2919       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2920       end_subject = t;
2921       }
2922
2923     /* There are some optimizations that avoid running the match if a known
2924     starting point is not found. However, there is an option that disables
2925     these, for testing and for ensuring that all callouts do actually occur. */
2926
2927     if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2928       {
2929       /* Advance to a known first byte. */
2930
2931       if (first_byte >= 0)
2932         {
2933         if (first_byte_caseless)
2934           while (current_subject < end_subject &&
2935                  lcc[*current_subject] != first_byte)
2936             current_subject++;
2937         else
2938           while (current_subject < end_subject &&
2939                  *current_subject != first_byte)
2940             current_subject++;
2941         }
2942
2943       /* Or to just after a linebreak for a multiline match if possible */
2944
2945       else if (startline)
2946         {
             /* This skip is not applied at the original start position, where
             current_subject still equals start_subject + start_offset. */
2947         if (current_subject > md->start_subject + start_offset)
2948           {
2949 #ifdef SUPPORT_UTF8
2950           if (utf8)
2951             {
2952             while (current_subject < end_subject &&
2953                    !WAS_NEWLINE(current_subject))
2954               {
2955               current_subject++;
2956               while(current_subject < end_subject &&
2957                     (*current_subject & 0xc0) == 0x80)
2958                 current_subject++;
2959               }
2960             }
2961           else
2962 #endif
2963           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2964             current_subject++;
2965
2966           /* If we have just passed a CR and the newline option is ANY or
2967           ANYCRLF, and we are now at a LF, advance the match position by one
2968           more character. */
2969
2970           if (current_subject[-1] == CHAR_CR &&
2971                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2972                current_subject < end_subject &&
2973                *current_subject == CHAR_NL)
2974             current_subject++;
2975           }
2976         }
2977
2978       /* Or to a non-unique first char after study. start_bits is indexed by
           byte value: byte c/8, bit c&7 is tested for each subject byte c, and
           the scan stops at the first byte whose bit is set. */
2979
2980       else if (start_bits != NULL)
2981         {
2982         while (current_subject < end_subject)
2983           {
2984           register unsigned int c = *current_subject;
2985           if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2986             else break;
2987           }
2988         }
2989       }
2990
2991     /* Restore fudged end_subject */
2992
2993     end_subject = save_end_subject;
2994
2995     /* The following two optimizations are disabled for partial matching or if
2996     disabling is explicitly requested (and of course, by the test above, this
2997     code is not obeyed when restarting after a partial match). */
2998
2999     if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3000         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3001       {
3002       /* If the pattern was studied, a minimum subject length may be set. This
3003       is a lower bound; no actual string of that length may actually match the
3004       pattern. Although the value is, strictly, in characters, we treat it as
3005       bytes to avoid spending too much time in this optimization. */
3006
3007       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3008           (pcre_uint32)(end_subject - current_subject) < study->minlength)
3009         return PCRE_ERROR_NOMATCH;
3010
3011       /* If req_byte is set, we know that that character must appear in the
3012       subject for the match to succeed. If the first character is set, req_byte
3013       must be later in the subject; otherwise the test starts at the match
3014       point. This optimization can save a huge amount of work in patterns with
3015       nested unlimited repeats that aren't going to match. Writing separate
3016       code for cased/caseless versions makes it go faster, as does using an
3017       autoincrement and backing off on a match.
3018
3019       HOWEVER: when the subject string is very, very long, searching to its end
3020       can take a long time, and give bad performance on quite ordinary
3021       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3022       string... so we don't do this when the string is sufficiently long. */
3023
3024       if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3025         {
3026         register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3027
3028         /* We don't need to repeat the search if we haven't yet reached the
3029         place we found it at last time. */
3030
3031         if (p > req_byte_ptr)
3032           {
3033           if (req_byte_caseless)
3034             {
3035             while (p < end_subject)
3036               {
3037               register int pp = *p++;
3038               if (pp == req_byte || pp == req_byte2) { p--; break; }
3039               }
3040             }
3041           else
3042             {
3043             while (p < end_subject)
3044               {
3045               if (*p++ == req_byte) { p--; break; }
3046               }
3047             }
3048
3049           /* If we can't find the required character, break the matching loop,
3050           which will cause a return of PCRE_ERROR_NOMATCH. */
3051
3052           if (p >= end_subject) break;
3053
3054           /* If we have found the required character, save the point where we
3055           found it, so that we don't search again next time round the loop if
3056           the start hasn't passed this character yet. */
3057
3058           req_byte_ptr = p;
3059           }
3060         }
3061       }
3062     }   /* End of optimizations that are done when not restarting */
3063
3064   /* OK, now we can do the business */
3065
3066   md->start_used_ptr = current_subject;
3067
3068   rc = internal_dfa_exec(
3069     md,                                /* fixed match data */
3070     md->start_code,                    /* this subexpression's code */
3071     current_subject,                   /* where we currently are */
3072     start_offset,                      /* start offset in subject */
3073     offsets,                           /* offset vector */
3074     offsetcount,                       /* size of same */
3075     workspace,                         /* workspace vector */
3076     wscount,                           /* size of same */
3077     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3078     0,                                 /* function recurse level */
3079     0);                                /* regex recurse level */
3080
3081   /* Anything other than "no match" means we are done, always; otherwise, carry
3082   on only if not anchored. */
3083
3084   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3085
3086   /* Advance to the next subject character unless we are at the end of a line
3087   and firstline is set. In UTF-8 mode any continuation bytes are skipped too.
       The end test is strictly "greater than", so one more attempt is still made
       with current_subject equal to end_subject. */
3088
3089   if (firstline && IS_NEWLINE(current_subject)) break;
3090   current_subject++;
3091   if (utf8)
3092     {
3093     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3094       current_subject++;
3095     }
3096   if (current_subject > end_subject) break;
3097
3098   /* If we have just passed a CR and we are now at a LF, and the pattern does
3099   not contain any explicit matches for \r or \n, and the newline option is CRLF
3100   or ANY or ANYCRLF, advance the match position by one more character. */
3101
3102   if (current_subject[-1] == CHAR_CR &&
3103       current_subject < end_subject &&
3104       *current_subject == CHAR_NL &&
3105       (re->flags & PCRE_HASCRORLF) == 0 &&
3106         (md->nltype == NLTYPE_ANY ||
3107          md->nltype == NLTYPE_ANYCRLF ||
3108          md->nllen == 2))
3109     current_subject++;
3110
3111   }   /* "Bumpalong" loop */
3112
     /* Control reaches here only via a break out of the loop above. */
     
3113 return PCRE_ERROR_NOMATCH;
3114 }
3115
3116 /* End of pcre_dfa_exec.c */