glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language (but see
   7 below for why this module is different).
   8
   9                        Written by Philip Hazel
  10            Copyright (c) 1997-2012 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41
  42 /* This module contains the external function pcre_dfa_exec(), which is an
  43 alternative matching function that uses a sort of DFA algorithm (not a true
   44 FSM). This is NOT Perl-compatible, but it has advantages in certain
  45 applications. */
  46
  47
  48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  49 the performance of his patterns greatly. I could not use it as it stood, as it
  50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  51 test 7 to loop, and test 9 to crash with a segfault.
  52
  53 The issue is the check for duplicate states, which is done by a simple linear
  54 search up the state list. (Grep for "duplicate" below to find the code.) For
  55 many patterns, there will never be many states active at one time, so a simple
  56 linear search is fine. In patterns that have many active states, it might be a
  57 bottleneck. The suggested code used an indexing scheme to remember which states
  58 had previously been used for each character, and avoided the linear search when
  59 it knew there was no chance of a duplicate. This was implemented when adding
  60 states to the state lists.
  61
  62 I wrote some thread-safe, not-limited code to try something similar at the time
  63 of checking for duplicates (instead of when adding states), using index vectors
  64 on the stack. It did give a 13% improvement with one specially constructed
  65 pattern for certain subject strings, but on other strings and on many of the
  66 simpler patterns in the test suite it did worse. The major problem, I think,
  67 was the extra time to initialize the index. This had to be done for each call
  68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
  69 only once - I suspect this was the cause of the problems with the tests.)
  70
  71 Overall, I concluded that the gains in some cases did not outweigh the losses
  72 in others, so I abandoned this code. */
  73
  74
  75
  76 #ifdef HAVE_CONFIG_H
  77 #include "config.h"
  78 #endif
  79
  80 #define NLBLOCK md             /* Block containing newline information */
  81 #define PSSTART start_subject  /* Field containing processed string start */
  82 #define PSEND   end_subject    /* Field containing processed string end */
  83
  84 #include "pcre_internal.h"
  85
  86
/* For use to indent debugging output. The debugging printf calls in this
module pass SP to a "%.*s" conversion with rlevel*2-2 as the precision, so the
output is indented by two further spaces for each nested level of recursion.
Because a precision only limits how much of a string is printed, the indent can
never be wider than SP itself. */

#define SP "                   "
  90
  91
  92 /*************************************************
  93 *      Code parameters and static tables         *
  94 *************************************************/
  95
  96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  97 into others, under special conditions. A gap of 20 between the blocks should be
  98 enough. The resulting opcodes don't have to be less than 256 because they are
  99 never stored, so we push them well clear of the normal opcodes. */
 100
 101 #define OP_PROP_EXTRA       300
 102 #define OP_EXTUNI_EXTRA     320
 103 #define OP_ANYNL_EXTRA      340
 104 #define OP_HSPACE_EXTRA     360
 105 #define OP_VSPACE_EXTRA     380
 106
 107
 108 /* This table identifies those opcodes that are followed immediately by a
 109 character that is to be tested in some way. This makes it possible to
 110 centralize the loading of these characters. In the case of Type * etc, the
 111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
 112 small value. Non-zero values in the table are the offsets from the opcode where
 113 the character is to be found. ***NOTE*** If the start of this table is
 114 modified, the three tables that follow must also be modified. */
 115
 116 static const pcre_uint8 coptable[] = {
 117   0,                             /* End                                    */
 118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
 119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
 120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
 121   0, 0,                          /* \P, \p                                 */
 122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
 123   0,                             /* \X                                     */
 124   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 125   1,                             /* Char                                   */
 126   1,                             /* Chari                                  */
 127   1,                             /* not                                    */
 128   1,                             /* noti                                   */
 129   /* Positive single-char repeats                                          */
 130   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 131   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
 132   1+IMM2_SIZE,                   /* exact                                  */
 133   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
 134   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 135   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
 136   1+IMM2_SIZE,                   /* exact I                                */
 137   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
 138   /* Negative single-char repeats - only for chars < 256                   */
 139   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 140   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
 141   1+IMM2_SIZE,                   /* NOT exact                              */
 142   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
 143   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 144   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
 145   1+IMM2_SIZE,                   /* NOT exact I                            */
 146   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
 147   /* Positive type repeats                                                 */
 148   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 149   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
 150   1+IMM2_SIZE,                   /* Type exact                             */
 151   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
 152   /* Character class & ref repeats                                         */
 153   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 154   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 155   0,                             /* CLASS                                  */
 156   0,                             /* NCLASS                                 */
 157   0,                             /* XCLASS - variable length               */
 158   0,                             /* REF                                    */
 159   0,                             /* REFI                                   */
 160   0,                             /* RECURSE                                */
 161   0,                             /* CALLOUT                                */
 162   0,                             /* Alt                                    */
 163   0,                             /* Ket                                    */
 164   0,                             /* KetRmax                                */
 165   0,                             /* KetRmin                                */
 166   0,                             /* KetRpos                                */
 167   0,                             /* Reverse                                */
 168   0,                             /* Assert                                 */
 169   0,                             /* Assert not                             */
 170   0,                             /* Assert behind                          */
 171   0,                             /* Assert behind not                      */
 172   0, 0,                          /* ONCE, ONCE_NC                          */
 173   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 174   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 175   0, 0,                          /* CREF, NCREF                            */
 176   0, 0,                          /* RREF, NRREF                            */
 177   0,                             /* DEF                                    */
 178   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 179   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 180   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 181   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 182   0, 0                           /* CLOSE, SKIPZERO  */
 183 };
 184
 185 /* This table identifies those opcodes that inspect a character. It is used to
 186 remember the fact that a character could have been inspected when the end of
 187 the subject is reached. ***NOTE*** If the start of this table is modified, the
 188 two tables that follow must also be modified. */
 189
 190 static const pcre_uint8 poptable[] = {
 191   0,                             /* End                                    */
 192   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
 193   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
 194   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
 195   1, 1,                          /* \P, \p                                 */
 196   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
 197   1,                             /* \X                                     */
 198   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 199   1,                             /* Char                                   */
 200   1,                             /* Chari                                  */
 201   1,                             /* not                                    */
 202   1,                             /* noti                                   */
 203   /* Positive single-char repeats                                          */
 204   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 205   1, 1, 1,                       /* upto, minupto, exact                   */
 206   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
 207   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 208   1, 1, 1,                       /* upto I, minupto I, exact I             */
 209   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
 210   /* Negative single-char repeats - only for chars < 256                   */
 211   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 212   1, 1, 1,                       /* NOT upto, minupto, exact               */
 213   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
 214   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 215   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
 216   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
 217   /* Positive type repeats                                                 */
 218   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 219   1, 1, 1,                       /* Type upto, minupto, exact              */
 220   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
 221   /* Character class & ref repeats                                         */
 222   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 223   1, 1,                          /* CRRANGE, CRMINRANGE                    */
 224   1,                             /* CLASS                                  */
 225   1,                             /* NCLASS                                 */
 226   1,                             /* XCLASS - variable length               */
 227   0,                             /* REF                                    */
 228   0,                             /* REFI                                   */
 229   0,                             /* RECURSE                                */
 230   0,                             /* CALLOUT                                */
 231   0,                             /* Alt                                    */
 232   0,                             /* Ket                                    */
 233   0,                             /* KetRmax                                */
 234   0,                             /* KetRmin                                */
 235   0,                             /* KetRpos                                */
 236   0,                             /* Reverse                                */
 237   0,                             /* Assert                                 */
 238   0,                             /* Assert not                             */
 239   0,                             /* Assert behind                          */
 240   0,                             /* Assert behind not                      */
 241   0, 0,                          /* ONCE, ONCE_NC                          */
 242   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 243   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 244   0, 0,                          /* CREF, NCREF                            */
 245   0, 0,                          /* RREF, NRREF                            */
 246   0,                             /* DEF                                    */
 247   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 248   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 249   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 250   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 251   0, 0                           /* CLOSE, SKIPZERO                        */
 252 };
 253
 254 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 255 and \w */
 256
 257 static const pcre_uint8 toptable1[] = {
 258   0, 0, 0, 0, 0, 0,
 259   ctype_digit, ctype_digit,
 260   ctype_space, ctype_space,
 261   ctype_word,  ctype_word,
 262   0, 0                            /* OP_ANY, OP_ALLANY */
 263 };
 264
 265 static const pcre_uint8 toptable2[] = {
 266   0, 0, 0, 0, 0, 0,
 267   ctype_digit, 0,
 268   ctype_space, 0,
 269   ctype_word,  0,
 270   1, 1                            /* OP_ANY, OP_ALLANY */
 271 };
 272
 273
 274 /* Structure for holding data about a particular state, which is in effect the
 275 current data for an active path through the match tree. It must consist
 276 entirely of ints because the working vector we are passed, and which we put
 277 these structures in, is a vector of ints. */
 278
 279 typedef struct stateblock {
 280   int offset;                     /* Offset to opcode */
 281   int count;                      /* Count for repeats */
 282   int data;                       /* Some use extra data */
 283 } stateblock;
 284
 285 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
 286
 287
 288 #ifdef PCRE_DEBUG
 289 /*************************************************
 290 *             Print character string             *
 291 *************************************************/
 292
 293 /* Character string printing function for debugging.
 294
 295 Arguments:
 296   p            points to string
 297   length       number of bytes
 298   f            where to print
 299
 300 Returns:       nothing
 301 */
 302
 303 static void
 304 pchars(const pcre_uchar *p, int length, FILE *f)
 305 {
 306 int c;
 307 while (length-- > 0)
 308   {
 309   if (isprint(c = *(p++)))
 310     fprintf(f, "%c", c);
 311   else
 312     fprintf(f, "\\x%02x", c);
 313   }
 314 }
 315 #endif
 316
 317
 318
 319 /*************************************************
 320 *    Execute a Regular Expression - DFA engine   *
 321 *************************************************/
 322
 323 /* This internal function applies a compiled pattern to a subject string,
 324 starting at a given point, using a DFA engine. This function is called from the
 325 external one, possibly multiple times if the pattern is not anchored. The
 326 function calls itself recursively for some kinds of subpattern.
 327
 328 Arguments:
 329   md                the match_data block with fixed information
 330   this_start_code   the opening bracket of this subexpression's code
 331   current_subject   where we currently are in the subject string
 332   start_offset      start offset in the subject string
 333   offsets           vector to contain the matching string offsets
 334   offsetcount       size of same
 335   workspace         vector of workspace
 336   wscount           size of same
 337   rlevel            function call recursion level
 338
 339 Returns:            > 0 => number of match offset pairs placed in offsets
 340                     = 0 => offsets overflowed; longest matches are present
 341                      -1 => failed to match
 342                    < -1 => some kind of unexpected problem
 343
 344 The following macros are used for adding states to the two state vectors (one
 345 for the current character, one for the following character). */
 346
 347 #define ADD_ACTIVE(x,y) \
 348   if (active_count++ < wscount) \
 349     { \
 350     next_active_state->offset = (x); \
 351     next_active_state->count  = (y); \
 352     next_active_state++; \
 353     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 354     } \
 355   else return PCRE_ERROR_DFA_WSSIZE
 356
 357 #define ADD_ACTIVE_DATA(x,y,z) \
 358   if (active_count++ < wscount) \
 359     { \
 360     next_active_state->offset = (x); \
 361     next_active_state->count  = (y); \
 362     next_active_state->data   = (z); \
 363     next_active_state++; \
 364     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 365     } \
 366   else return PCRE_ERROR_DFA_WSSIZE
 367
 368 #define ADD_NEW(x,y) \
 369   if (new_count++ < wscount) \
 370     { \
 371     next_new_state->offset = (x); \
 372     next_new_state->count  = (y); \
 373     next_new_state++; \
 374     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 375     } \
 376   else return PCRE_ERROR_DFA_WSSIZE
 377
 378 #define ADD_NEW_DATA(x,y,z) \
 379   if (new_count++ < wscount) \
 380     { \
 381     next_new_state->offset = (x); \
 382     next_new_state->count  = (y); \
 383     next_new_state->data   = (z); \
 384     next_new_state++; \
 385     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 386     } \
 387   else return PCRE_ERROR_DFA_WSSIZE
 388
 389 /* And now, here is the code */
 390
 391 static int
 392 internal_dfa_exec(
 393   dfa_match_data *md,
 394   const pcre_uchar *this_start_code,
 395   const pcre_uchar *current_subject,
 396   int start_offset,
 397   int *offsets,
 398   int offsetcount,
 399   int *workspace,
 400   int wscount,
 401   int  rlevel)
 402 {
 403 stateblock *active_states, *new_states, *temp_states;
 404 stateblock *next_active_state, *next_new_state;
 405
 406 const pcre_uint8 *ctypes, *lcc, *fcc;
 407 const pcre_uchar *ptr;
 408 const pcre_uchar *end_code, *first_op;
 409
 410 dfa_recursion_info new_recursive;
 411
 412 int active_count, new_count, match_count;
 413
 414 /* Some fields in the md block are frequently referenced, so we load them into
 415 independent variables in the hope that this will perform better. */
 416
 417 const pcre_uchar *start_subject = md->start_subject;
 418 const pcre_uchar *end_subject = md->end_subject;
 419 const pcre_uchar *start_code = md->start_code;
 420
 421 #ifdef SUPPORT_UTF
 422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 423 #else
 424 BOOL utf = FALSE;
 425 #endif
 426
 427 rlevel++;
 428 offsetcount &= (-2);
 429
 430 wscount -= 2;
 431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 432           (2 * INTS_PER_STATEBLOCK);
 433
 434 DPRINTF(("\n%.*s---------------------\n"
 435   "%.*sCall to internal_dfa_exec f=%d\n",
 436   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
 437
 438 ctypes = md->tables + ctypes_offset;
 439 lcc = md->tables + lcc_offset;
 440 fcc = md->tables + fcc_offset;
 441
 442 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 443
 444 active_states = (stateblock *)(workspace + 2);
 445 next_new_state = new_states = active_states + wscount;
 446 new_count = 0;
 447
 448 first_op = this_start_code + 1 + LINK_SIZE +
 449   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 450     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 451     ? IMM2_SIZE:0);
 452
 453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 454 the alternative states onto the list, and find out where the end is. This
  455 makes it possible to use this function recursively, when we want to stop at a
 456 matching internal ket rather than at the end.
 457
 458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 459 a backward assertion. In that case, we have to find out the maximum amount to
 460 move back, and set up each alternative appropriately. */
 461
 462 if (*first_op == OP_REVERSE)
 463   {
 464   int max_back = 0;
 465   int gone_back;
 466
 467   end_code = this_start_code;
 468   do
 469     {
 470     int back = GET(end_code, 2+LINK_SIZE);
 471     if (back > max_back) max_back = back;
 472     end_code += GET(end_code, 1);
 473     }
 474   while (*end_code == OP_ALT);
 475
 476   /* If we can't go back the amount required for the longest lookbehind
 477   pattern, go back as far as we can; some alternatives may still be viable. */
 478
 479 #ifdef SUPPORT_UTF
 480   /* In character mode we have to step back character by character */
 481
 482   if (utf)
 483     {
 484     for (gone_back = 0; gone_back < max_back; gone_back++)
 485       {
 486       if (current_subject <= start_subject) break;
 487       current_subject--;
 488       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
 489       }
 490     }
 491   else
 492 #endif
 493
 494   /* In byte-mode we can do this quickly. */
 495
 496     {
 497     gone_back = (current_subject - max_back < start_subject)?
 498       (int)(current_subject - start_subject) : max_back;
 499     current_subject -= gone_back;
 500     }
 501
 502   /* Save the earliest consulted character */
 503
 504   if (current_subject < md->start_used_ptr)
 505     md->start_used_ptr = current_subject;
 506
 507   /* Now we can process the individual branches. */
 508
 509   end_code = this_start_code;
 510   do
 511     {
 512     int back = GET(end_code, 2+LINK_SIZE);
 513     if (back <= gone_back)
 514       {
 515       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
 516       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 517       }
 518     end_code += GET(end_code, 1);
 519     }
 520   while (*end_code == OP_ALT);
 521  }
 522
 523 /* This is the code for a "normal" subpattern (not a backward assertion). The
 524 start of a whole pattern is always one of these. If we are at the top level,
 525 we may be asked to restart matching from the same point that we reached for a
 526 previous partial match. We still have to scan through the top-level branches to
 527 find the end state. */
 528
 529 else
 530   {
 531   end_code = this_start_code;
 532
 533   /* Restarting */
 534
 535   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 536     {
 537     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 538     new_count = workspace[1];
 539     if (!workspace[0])
 540       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 541     }
 542
 543   /* Not restarting */
 544
 545   else
 546     {
 547     int length = 1 + LINK_SIZE +
 548       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 549         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 550         ? IMM2_SIZE:0);
 551     do
 552       {
 553       ADD_NEW((int)(end_code - start_code + length), 0);
 554       end_code += GET(end_code, 1);
 555       length = 1 + LINK_SIZE;
 556       }
 557     while (*end_code == OP_ALT);
 558     }
 559   }
 560
 561 workspace[0] = 0;    /* Bit indicating which vector is current */
 562
 563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
 564
 565 /* Loop for scanning the subject */
 566
 567 ptr = current_subject;
 568 for (;;)
 569   {
 570   int i, j;
 571   int clen, dlen;
 572   unsigned int c, d;
 573   int forced_fail = 0;
 574   BOOL could_continue = FALSE;
 575
 576   /* Make the new state list into the active state list and empty the
 577   new state list. */
 578
 579   temp_states = active_states;
 580   active_states = new_states;
 581   new_states = temp_states;
 582   active_count = new_count;
 583   new_count = 0;
 584
 585   workspace[0] ^= 1;              /* Remember for the restarting feature */
 586   workspace[1] = active_count;
 587
 588 #ifdef PCRE_DEBUG
 589   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 590   pchars(ptr, STRLEN_UC(ptr), stdout);
 591   printf("\"\n");
 592
 593   printf("%.*sActive states: ", rlevel*2-2, SP);
 594   for (i = 0; i < active_count; i++)
 595     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 596   printf("\n");
 597 #endif
 598
 599   /* Set the pointers for adding new states */
 600
 601   next_active_state = active_states + active_count;
 602   next_new_state = new_states;
 603
 604   /* Load the current character from the subject outside the loop, as many
 605   different states may want to look at it, and we assume that at least one
 606   will. */
 607
 608   if (ptr < end_subject)
 609     {
 610     clen = 1;        /* Number of bytes in the character */
 611 #ifdef SUPPORT_UTF
 612     if (utf) { GETCHARLEN(c, ptr, clen); } else
 613 #endif  /* SUPPORT_UTF */
 614     c = *ptr;
 615     }
 616   else
 617     {
 618     clen = 0;        /* This indicates the end of the subject */
 619     c = NOTACHAR;    /* This value should never actually be used */
 620     }
 621
 622   /* Scan up the active states and act on each one. The result of an action
 623   may be to add more states to the currently active list (e.g. on hitting a
 624   parenthesis) or it may be to put states on the new list, for considering
 625   when we move the character pointer on. */
 626
 627   for (i = 0; i < active_count; i++)
 628     {
 629     stateblock *current_state = active_states + i;
 630     BOOL caseless = FALSE;
 631     const pcre_uchar *code;
 632     int state_offset = current_state->offset;
 633     int count, codevalue, rrc;
 634
 635 #ifdef PCRE_DEBUG
 636     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 637     if (clen == 0) printf("EOL\n");
 638       else if (c > 32 && c < 127) printf("'%c'\n", c);
 639         else printf("0x%02x\n", c);
 640 #endif
 641
 642     /* A negative offset is a special case meaning "hold off going to this
 643     (negated) state until the number of characters in the data field have
 644     been skipped". */
 645
 646     if (state_offset < 0)
 647       {
 648       if (current_state->data > 0)
 649         {
 650         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 651         ADD_NEW_DATA(state_offset, current_state->count,
 652           current_state->data - 1);
 653         continue;
 654         }
 655       else
 656         {
 657         current_state->offset = state_offset = -state_offset;
 658         }
 659       }
 660
 661     /* Check for a duplicate state with the same count, and skip if found.
 662     See the note at the head of this module about the possibility of improving
 663     performance here. */
 664
 665     for (j = 0; j < i; j++)
 666       {
 667       if (active_states[j].offset == state_offset &&
 668           active_states[j].count == current_state->count)
 669         {
 670         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 671         goto NEXT_ACTIVE_STATE;
 672         }
 673       }
 674
 675     /* The state offset is the offset to the opcode */
 676
 677     code = start_code + state_offset;
 678     codevalue = *code;
 679
 680     /* If this opcode inspects a character, but we are at the end of the
 681     subject, remember the fact for use when testing for a partial match. */
 682
 683     if (clen == 0 && poptable[codevalue] != 0)
 684       could_continue = TRUE;
 685
 686     /* If this opcode is followed by an inline character, load it. It is
 687     tempting to test for the presence of a subject character here, but that
 688     is wrong, because sometimes zero repetitions of the subject are
 689     permitted.
 690
 691     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 692     argument that is not a data character - but is always one byte long. We
 693     have to take special action to deal with \P, \p, \R, \H, \h, \V, \v and \X in
 694     this case. To keep the other cases fast, convert these ones to new opcodes.
 695     */
 696
 697     if (coptable[codevalue] > 0)
 698       {
 699       dlen = 1;
 700 #ifdef SUPPORT_UTF
 701       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 702 #endif  /* SUPPORT_UTF */
 703       d = code[coptable[codevalue]];
 704       if (codevalue >= OP_TYPESTAR)
 705         {
 706         switch(d)
 707           {
 708           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 709           case OP_NOTPROP:
 710           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 711           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 712           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 713           case OP_NOT_HSPACE:
 714           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 715           case OP_NOT_VSPACE:
 716           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 717           default: break;
 718           }
 719         }
 720       }
 721     else
 722       {
 723       dlen = 0;         /* Not strictly necessary, but compilers moan */
 724       d = NOTACHAR;     /* if these variables are not set. */
 725       }
 726
 727
 728     /* Now process the individual opcodes */
 729
 730     switch (codevalue)
 731       {
 732 /* ========================================================================== */
 733       /* These cases are never obeyed. This is a fudge that causes a compile-
 734       time error if the vectors coptable or poptable, which are indexed by
 735       opcode, are not the correct length. It seems to be the only way to do
 736       such a check at compile time, as the sizeof() operator does not work
 737       in the C preprocessor. */
 738
 739       case OP_TABLE_LENGTH:
 740       case OP_TABLE_LENGTH +
 741         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 742          (sizeof(poptable) == OP_TABLE_LENGTH)):
 743       break;
 744
 745 /* ========================================================================== */
 746       /* Reached a closing bracket. If not at the end of the pattern, carry
 747       on with the next opcode. For repeating opcodes, also add the repeat
 748       state. Note that KETRPOS will always be encountered at the end of the
 749       subpattern, because the possessive subpattern repeats are always handled
 750       using recursive calls. Thus, it never adds any new states.
 751
 752       At the end of the (sub)pattern, unless we have an empty string and
 753       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
 754       start of the subject, save the match data, shifting up all previous
 755       matches so we always have the longest first. */
 756
 757       case OP_KET:
 758       case OP_KETRMIN:
 759       case OP_KETRMAX:
 760       case OP_KETRPOS:
 761       if (code != end_code)
 762         {
 763         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 764         if (codevalue != OP_KET)
 765           {
 766           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 767           }
 768         }
 769       else
 770         {
 771         if (ptr > current_subject ||
 772             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
 773               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
 774                 current_subject > start_subject + md->start_offset)))
 775           {
 776           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 777             else if (match_count > 0 && ++match_count * 2 > offsetcount)
 778               match_count = 0;
 779           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 780           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 781           if (offsetcount >= 2)
 782             {
 783             offsets[0] = (int)(current_subject - start_subject);
 784             offsets[1] = (int)(ptr - start_subject);
 785             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 786               offsets[1] - offsets[0], current_subject));
 787             }
 788           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 789             {
 790             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 791               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 792               match_count, rlevel*2-2, SP));
 793             return match_count;
 794             }
 795           }
 796         }
 797       break;
 798
 799 /* ========================================================================== */
 800       /* These opcodes add to the current list of states without looking
 801       at the current character. */
 802
 803       /*-----------------------------------------------------------------*/
 804       case OP_ALT:
 805       do { code += GET(code, 1); } while (*code == OP_ALT);
 806       ADD_ACTIVE((int)(code - start_code), 0);
 807       break;
 808
 809       /*-----------------------------------------------------------------*/
 810       case OP_BRA:
 811       case OP_SBRA:
 812       do
 813         {
 814         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 815         code += GET(code, 1);
 816         }
 817       while (*code == OP_ALT);
 818       break;
 819
 820       /*-----------------------------------------------------------------*/
 821       case OP_CBRA:
 822       case OP_SCBRA:
 823       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
 824       code += GET(code, 1);
 825       while (*code == OP_ALT)
 826         {
 827         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
 828         code += GET(code, 1);
 829         }
 830       break;
 831
 832       /*-----------------------------------------------------------------*/
 833       case OP_BRAZERO:
 834       case OP_BRAMINZERO:
 835       ADD_ACTIVE(state_offset + 1, 0);
 836       code += 1 + GET(code, 2);
 837       while (*code == OP_ALT) code += GET(code, 1);
 838       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 839       break;
 840
 841       /*-----------------------------------------------------------------*/
 842       case OP_SKIPZERO:
 843       code += 1 + GET(code, 2);
 844       while (*code == OP_ALT) code += GET(code, 1);
 845       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 846       break;
 847
 848       /*-----------------------------------------------------------------*/
 849       case OP_CIRC:
 850       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
 851         { ADD_ACTIVE(state_offset + 1, 0); }
 852       break;
 853
 854       /*-----------------------------------------------------------------*/
 855       case OP_CIRCM:
 856       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 857           (ptr != end_subject && WAS_NEWLINE(ptr)))
 858         { ADD_ACTIVE(state_offset + 1, 0); }
 859       break;
 860
 861       /*-----------------------------------------------------------------*/
 862       case OP_EOD:
 863       if (ptr >= end_subject)
 864         {
 865         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 866           could_continue = TRUE;
 867         else { ADD_ACTIVE(state_offset + 1, 0); }
 868         }
 869       break;
 870
 871       /*-----------------------------------------------------------------*/
 872       case OP_SOD:
 873       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 874       break;
 875
 876       /*-----------------------------------------------------------------*/
 877       case OP_SOM:
 878       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 879       break;
 880
 881
 882 /* ========================================================================== */
 883       /* These opcodes inspect the next subject character, and sometimes
 884       the previous one as well, but do not have an argument. The variable
 885       clen contains the length of the current character and is zero if we are
 886       at the end of the subject. */
 887
 888       /*-----------------------------------------------------------------*/
 889       case OP_ANY:
 890       if (clen > 0 && !IS_NEWLINE(ptr))
 891         { ADD_NEW(state_offset + 1, 0); }
 892       break;
 893
 894       /*-----------------------------------------------------------------*/
 895       case OP_ALLANY:
 896       if (clen > 0)
 897         { ADD_NEW(state_offset + 1, 0); }
 898       break;
 899
 900       /*-----------------------------------------------------------------*/
 901       case OP_EODN:
 902       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 903         could_continue = TRUE;
 904       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 905         { ADD_ACTIVE(state_offset + 1, 0); }
 906       break;
 907
 908       /*-----------------------------------------------------------------*/
 909       case OP_DOLL:
 910       if ((md->moptions & PCRE_NOTEOL) == 0)
 911         {
 912         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 913           could_continue = TRUE;
 914         else if (clen == 0 ||
 915             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
 916                (ptr == end_subject - md->nllen)
 917             ))
 918           { ADD_ACTIVE(state_offset + 1, 0); }
 919         }
 920       break;
 921
 922       /*-----------------------------------------------------------------*/
 923       case OP_DOLLM:
 924       if ((md->moptions & PCRE_NOTEOL) == 0)
 925         {
 926         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 927           could_continue = TRUE;
 928         else if (clen == 0 ||
 929             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
 930           { ADD_ACTIVE(state_offset + 1, 0); }
 931         }
 932       else if (IS_NEWLINE(ptr))
 933         { ADD_ACTIVE(state_offset + 1, 0); }
 934       break;
 935
 936       /*-----------------------------------------------------------------*/
 937
 938       case OP_DIGIT:
 939       case OP_WHITESPACE:
 940       case OP_WORDCHAR:
 941       if (clen > 0 && c < 256 &&
 942             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 943         { ADD_NEW(state_offset + 1, 0); }
 944       break;
 945
 946       /*-----------------------------------------------------------------*/
 947       case OP_NOT_DIGIT:
 948       case OP_NOT_WHITESPACE:
 949       case OP_NOT_WORDCHAR:
 950       if (clen > 0 && (c >= 256 ||
 951             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 952         { ADD_NEW(state_offset + 1, 0); }
 953       break;
 954
 955       /*-----------------------------------------------------------------*/
 956       case OP_WORD_BOUNDARY:
 957       case OP_NOT_WORD_BOUNDARY:
 958         {
 959         int left_word, right_word;
 960
 961         if (ptr > start_subject)
 962           {
 963           const pcre_uchar *temp = ptr - 1;
 964           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
 965 #ifdef SUPPORT_UTF
 966           if (utf) { BACKCHAR(temp); }
 967 #endif
 968           GETCHARTEST(d, temp);
 969 #ifdef SUPPORT_UCP
 970           if ((md->poptions & PCRE_UCP) != 0)
 971             {
 972             if (d == '_') left_word = TRUE; else
 973               {
 974               int cat = UCD_CATEGORY(d);
 975               left_word = (cat == ucp_L || cat == ucp_N);
 976               }
 977             }
 978           else
 979 #endif
 980           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
 981           }
 982         else left_word = FALSE;
 983
 984         if (clen > 0)
 985           {
 986 #ifdef SUPPORT_UCP
 987           if ((md->poptions & PCRE_UCP) != 0)
 988             {
 989             if (c == '_') right_word = TRUE; else
 990               {
 991               int cat = UCD_CATEGORY(c);
 992               right_word = (cat == ucp_L || cat == ucp_N);
 993               }
 994             }
 995           else
 996 #endif
 997           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
 998           }
 999         else right_word = FALSE;
1000
1001         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002           { ADD_ACTIVE(state_offset + 1, 0); }
1003         }
1004       break;
1005
1006
1007       /*-----------------------------------------------------------------*/
1008       /* Check the next character by Unicode property. We will get here only
1009       if the support is in the binary; otherwise a compile-time error occurs.
1010       */
1011
1012 #ifdef SUPPORT_UCP
1013       case OP_PROP:
1014       case OP_NOTPROP:
1015       if (clen > 0)
1016         {
1017         BOOL OK;
1018         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1019         switch(code[1])
1020           {
1021           case PT_ANY:
1022           OK = TRUE;
1023           break;
1024
1025           case PT_LAMP:
1026           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1027                chartype == ucp_Lt;
1028           break;
1029
1030           case PT_GC:
1031           OK = PRIV(ucp_gentype)[chartype] == code[2];
1032           break;
1033
1034           case PT_PC:
1035           OK = chartype == code[2];
1036           break;
1037
1038           case PT_SC:
1039           OK = UCD_SCRIPT(c) == code[2];
1040           break;
1041
1042           /* These are specials for combination cases. */
1043
1044           case PT_ALNUM:
1045           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1046                PRIV(ucp_gentype)[chartype] == ucp_N;
1047           break;
1048
1049           case PT_SPACE:    /* Perl space */
1050           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1051                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052           break;
1053
1054           case PT_PXSPACE:  /* POSIX space */
1055           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1056                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057                c == CHAR_FF || c == CHAR_CR;
1058           break;
1059
1060           case PT_WORD:
1061           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1062                PRIV(ucp_gentype)[chartype] == ucp_N ||
1063                c == CHAR_UNDERSCORE;
1064           break;
1065
1066           /* Should never occur, but keep compilers from grumbling. */
1067
1068           default:
1069           OK = codevalue != OP_PROP;
1070           break;
1071           }
1072
1073         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1074         }
1075       break;
1076 #endif
1077
1078
1079
1080 /* ========================================================================== */
1081       /* These opcodes likewise inspect the subject character, but have an
1082       argument that is not a data character. It is one of these opcodes:
1083       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1084       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1085
1086       case OP_TYPEPLUS:
1087       case OP_TYPEMINPLUS:
1088       case OP_TYPEPOSPLUS:
1089       count = current_state->count;  /* Already matched */
1090       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1091       if (clen > 0)
1092         {
1093         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1094             (c < 256 &&
1095               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1096               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1097           {
1098           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1099             {
1100             active_count--;            /* Remove non-match possibility */
1101             next_active_state--;
1102             }
1103           count++;
1104           ADD_NEW(state_offset, count);
1105           }
1106         }
1107       break;
1108
1109       /*-----------------------------------------------------------------*/
1110       case OP_TYPEQUERY:
1111       case OP_TYPEMINQUERY:
1112       case OP_TYPEPOSQUERY:
1113       ADD_ACTIVE(state_offset + 2, 0);
1114       if (clen > 0)
1115         {
1116         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1117             (c < 256 &&
1118               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1119               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1120           {
1121           if (codevalue == OP_TYPEPOSQUERY)
1122             {
1123             active_count--;            /* Remove non-match possibility */
1124             next_active_state--;
1125             }
1126           ADD_NEW(state_offset + 2, 0);
1127           }
1128         }
1129       break;
1130
1131       /*-----------------------------------------------------------------*/
1132       case OP_TYPESTAR:
1133       case OP_TYPEMINSTAR:
1134       case OP_TYPEPOSSTAR:
1135       ADD_ACTIVE(state_offset + 2, 0);
1136       if (clen > 0)
1137         {
1138         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1139             (c < 256 &&
1140               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1141               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1142           {
1143           if (codevalue == OP_TYPEPOSSTAR)
1144             {
1145             active_count--;            /* Remove non-match possibility */
1146             next_active_state--;
1147             }
1148           ADD_NEW(state_offset, 0);
1149           }
1150         }
1151       break;
1152
1153       /*-----------------------------------------------------------------*/
1154       case OP_TYPEEXACT:
1155       count = current_state->count;  /* Number already matched */
1156       if (clen > 0)
1157         {
1158         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159             (c < 256 &&
1160               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162           {
1163           if (++count >= GET2(code, 1))
1164             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1165           else
1166             { ADD_NEW(state_offset, count); }
1167           }
1168         }
1169       break;
1170
1171       /*-----------------------------------------------------------------*/
1172       case OP_TYPEUPTO:
1173       case OP_TYPEMINUPTO:
1174       case OP_TYPEPOSUPTO:
1175       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176       count = current_state->count;  /* Number already matched */
1177       if (clen > 0)
1178         {
1179         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180             (c < 256 &&
1181               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183           {
1184           if (codevalue == OP_TYPEPOSUPTO)
1185             {
1186             active_count--;           /* Remove non-match possibility */
1187             next_active_state--;
1188             }
1189           if (++count >= GET2(code, 1))
1190             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1191           else
1192             { ADD_NEW(state_offset, count); }
1193           }
1194         }
1195       break;
1196
1197 /* ========================================================================== */
1198       /* These are virtual opcodes that are used when something like
1199       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1200       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1201       code above fast for the other cases. The argument is in the d variable. */
1202
1203 #ifdef SUPPORT_UCP
1204       case OP_PROP_EXTRA + OP_TYPEPLUS:
1205       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1206       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1207       count = current_state->count;           /* Already matched */
1208       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1209       if (clen > 0)
1210         {
1211         BOOL OK;
1212         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1213         switch(code[2])
1214           {
1215           case PT_ANY:
1216           OK = TRUE;
1217           break;
1218
1219           case PT_LAMP:
1220           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1221             chartype == ucp_Lt;
1222           break;
1223
1224           case PT_GC:
1225           OK = PRIV(ucp_gentype)[chartype] == code[3];
1226           break;
1227
1228           case PT_PC:
1229           OK = chartype == code[3];
1230           break;
1231
1232           case PT_SC:
1233           OK = UCD_SCRIPT(c) == code[3];
1234           break;
1235
1236           /* These are specials for combination cases. */
1237
1238           case PT_ALNUM:
1239           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1240                PRIV(ucp_gentype)[chartype] == ucp_N;
1241           break;
1242
1243           case PT_SPACE:    /* Perl space */
1244           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1245                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246           break;
1247
1248           case PT_PXSPACE:  /* POSIX space */
1249           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1250                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251                c == CHAR_FF || c == CHAR_CR;
1252           break;
1253
1254           case PT_WORD:
1255           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1256                PRIV(ucp_gentype)[chartype] == ucp_N ||
1257                c == CHAR_UNDERSCORE;
1258           break;
1259
1260           /* Should never occur, but keep compilers from grumbling. */
1261
1262           default:
1263           OK = codevalue != OP_PROP;
1264           break;
1265           }
1266
1267         if (OK == (d == OP_PROP))
1268           {
1269           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1270             {
1271             active_count--;           /* Remove non-match possibility */
1272             next_active_state--;
1273             }
1274           count++;
1275           ADD_NEW(state_offset, count);
1276           }
1277         }
1278       break;
1279
1280       /*-----------------------------------------------------------------*/
1281       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1282       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1283       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284       count = current_state->count;  /* Already matched */
1285       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287         {
1288         const pcre_uchar *nptr = ptr + clen;
1289         int ncount = 0;
1290         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291           {
1292           active_count--;           /* Remove non-match possibility */
1293           next_active_state--;
1294           }
1295         while (nptr < end_subject)
1296           {
1297           int nd;
1298           int ndlen = 1;
1299           GETCHARLEN(nd, nptr, ndlen);
1300           if (UCD_CATEGORY(nd) != ucp_M) break;
1301           ncount++;
1302           nptr += ndlen;
1303           }
1304         count++;
1305         ADD_NEW_DATA(-state_offset, count, ncount);
1306         }
1307       break;
1308 #endif
1309
1310       /*-----------------------------------------------------------------*/
1311       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1312       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1313       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1314       count = current_state->count;  /* Already matched */
1315       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1316       if (clen > 0)
1317         {
1318         int ncount = 0;
1319         switch (c)
1320           {
1321           case 0x000b:
1322           case 0x000c:
1323           case 0x0085:
1324           case 0x2028:
1325           case 0x2029:
1326           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1327           goto ANYNL01;
1328
1329           case 0x000d:
1330           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1331           /* Fall through */
1332
1333           ANYNL01:
1334           case 0x000a:
1335           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1336             {
1337             active_count--;           /* Remove non-match possibility */
1338             next_active_state--;
1339             }
1340           count++;
1341           ADD_NEW_DATA(-state_offset, count, ncount);
1342           break;
1343
1344           default:
1345           break;
1346           }
1347         }
1348       break;
1349
1350       /*-----------------------------------------------------------------*/
1351       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1352       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1353       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1354       count = current_state->count;  /* Already matched */
1355       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1356       if (clen > 0)
1357         {
1358         BOOL OK;
1359         switch (c)
1360           {
1361           case 0x000a:
1362           case 0x000b:
1363           case 0x000c:
1364           case 0x000d:
1365           case 0x0085:
1366           case 0x2028:
1367           case 0x2029:
1368           OK = TRUE;
1369           break;
1370
1371           default:
1372           OK = FALSE;
1373           break;
1374           }
1375
1376         if (OK == (d == OP_VSPACE))
1377           {
1378           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1379             {
1380             active_count--;           /* Remove non-match possibility */
1381             next_active_state--;
1382             }
1383           count++;
1384           ADD_NEW_DATA(-state_offset, count, 0);
1385           }
1386         }
1387       break;
1388
1389       /*-----------------------------------------------------------------*/
1390       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1391       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1392       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1393       count = current_state->count;  /* Already matched */
1394       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1395       if (clen > 0)
1396         {
1397         BOOL OK;
1398         switch (c)
1399           {
1400           case 0x09:      /* HT */
1401           case 0x20:      /* SPACE */
1402           case 0xa0:      /* NBSP */
1403           case 0x1680:    /* OGHAM SPACE MARK */
1404           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1405           case 0x2000:    /* EN QUAD */
1406           case 0x2001:    /* EM QUAD */
1407           case 0x2002:    /* EN SPACE */
1408           case 0x2003:    /* EM SPACE */
1409           case 0x2004:    /* THREE-PER-EM SPACE */
1410           case 0x2005:    /* FOUR-PER-EM SPACE */
1411           case 0x2006:    /* SIX-PER-EM SPACE */
1412           case 0x2007:    /* FIGURE SPACE */
1413           case 0x2008:    /* PUNCTUATION SPACE */
1414           case 0x2009:    /* THIN SPACE */
1415           case 0x200A:    /* HAIR SPACE */
1416           case 0x202f:    /* NARROW NO-BREAK SPACE */
1417           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1418           case 0x3000:    /* IDEOGRAPHIC SPACE */
1419           OK = TRUE;
1420           break;
1421
1422           default:
1423           OK = FALSE;
1424           break;
1425           }
1426
1427         if (OK == (d == OP_HSPACE))
1428           {
1429           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1430             {
1431             active_count--;           /* Remove non-match possibility */
1432             next_active_state--;
1433             }
1434           count++;
1435           ADD_NEW_DATA(-state_offset, count, 0);
1436           }
1437         }
1438       break;
1439
1440       /*-----------------------------------------------------------------*/
1441 #ifdef SUPPORT_UCP
1442       case OP_PROP_EXTRA + OP_TYPEQUERY:
1443       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1444       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1445       count = 4;
1446       goto QS1;
1447
1448       case OP_PROP_EXTRA + OP_TYPESTAR:
1449       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1450       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1451       count = 0;
1452
1453       QS1:
1454
1455       ADD_ACTIVE(state_offset + 4, 0);
1456       if (clen > 0)
1457         {
1458         BOOL OK;
1459         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1460         switch(code[2])
1461           {
1462           case PT_ANY:
1463           OK = TRUE;
1464           break;
1465
1466           case PT_LAMP:
1467           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1468             chartype == ucp_Lt;
1469           break;
1470
1471           case PT_GC:
1472           OK = PRIV(ucp_gentype)[chartype] == code[3];
1473           break;
1474
1475           case PT_PC:
1476           OK = chartype == code[3];
1477           break;
1478
1479           case PT_SC:
1480           OK = UCD_SCRIPT(c) == code[3];
1481           break;
1482
1483           /* These are specials for combination cases. */
1484
1485           case PT_ALNUM:
1486           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1487                PRIV(ucp_gentype)[chartype] == ucp_N;
1488           break;
1489
1490           case PT_SPACE:    /* Perl space */
1491           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1492                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493           break;
1494
1495           case PT_PXSPACE:  /* POSIX space */
1496           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1497                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498                c == CHAR_FF || c == CHAR_CR;
1499           break;
1500
1501           case PT_WORD:
1502           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1503                PRIV(ucp_gentype)[chartype] == ucp_N ||
1504                c == CHAR_UNDERSCORE;
1505           break;
1506
1507           /* Should never occur, but keep compilers from grumbling. */
1508
1509           default:
1510           OK = codevalue != OP_PROP;
1511           break;
1512           }
1513
1514         if (OK == (d == OP_PROP))
1515           {
1516           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1517               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1518             {
1519             active_count--;           /* Remove non-match possibility */
1520             next_active_state--;
1521             }
1522           ADD_NEW(state_offset + count, 0);
1523           }
1524         }
1525       break;
1526
1527       /*-----------------------------------------------------------------*/
1528       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1529       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1530       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1531       count = 2;
1532       goto QS2;
1533
1534       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1535       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1536       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1537       count = 0;
1538
1539       QS2:
1540
1541       ADD_ACTIVE(state_offset + 2, 0);
1542       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543         {
1544         const pcre_uchar *nptr = ptr + clen;
1545         int ncount = 0;
1546         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1548           {
1549           active_count--;           /* Remove non-match possibility */
1550           next_active_state--;
1551           }
1552         while (nptr < end_subject)
1553           {
1554           int nd;
1555           int ndlen = 1;
1556           GETCHARLEN(nd, nptr, ndlen);
1557           if (UCD_CATEGORY(nd) != ucp_M) break;
1558           ncount++;
1559           nptr += ndlen;
1560           }
1561         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1562         }
1563       break;
1564 #endif
1565
1566       /*-----------------------------------------------------------------*/
1567       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1568       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1569       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1570       count = 2;
1571       goto QS3;
1572
1573       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1574       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1575       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1576       count = 0;
1577
1578       QS3:
1579       ADD_ACTIVE(state_offset + 2, 0);
1580       if (clen > 0)
1581         {
1582         int ncount = 0;
1583         switch (c)
1584           {
1585           case 0x000b:
1586           case 0x000c:
1587           case 0x0085:
1588           case 0x2028:
1589           case 0x2029:
1590           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1591           goto ANYNL02;
1592
1593           case 0x000d:
1594           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1595           /* Fall through */
1596
1597           ANYNL02:
1598           case 0x000a:
1599           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1600               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1601             {
1602             active_count--;           /* Remove non-match possibility */
1603             next_active_state--;
1604             }
1605           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1606           break;
1607
1608           default:
1609           break;
1610           }
1611         }
1612       break;
1613
1614       /*-----------------------------------------------------------------*/
1615       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1616       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1617       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1618       count = 2;
1619       goto QS4;
1620
1621       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1622       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1623       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1624       count = 0;
1625
1626       QS4:
1627       ADD_ACTIVE(state_offset + 2, 0);
1628       if (clen > 0)
1629         {
1630         BOOL OK;
1631         switch (c)
1632           {
1633           case 0x000a:
1634           case 0x000b:
1635           case 0x000c:
1636           case 0x000d:
1637           case 0x0085:
1638           case 0x2028:
1639           case 0x2029:
1640           OK = TRUE;
1641           break;
1642
1643           default:
1644           OK = FALSE;
1645           break;
1646           }
1647         if (OK == (d == OP_VSPACE))
1648           {
1649           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1650               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1651             {
1652             active_count--;           /* Remove non-match possibility */
1653             next_active_state--;
1654             }
1655           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1656           }
1657         }
1658       break;
1659
1660       /*-----------------------------------------------------------------*/
1661       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1662       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1663       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1664       count = 2;
1665       goto QS5;
1666
1667       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1668       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1669       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1670       count = 0;
1671
1672       QS5:
1673       ADD_ACTIVE(state_offset + 2, 0);
1674       if (clen > 0)
1675         {
1676         BOOL OK;
1677         switch (c)
1678           {
1679           case 0x09:      /* HT */
1680           case 0x20:      /* SPACE */
1681           case 0xa0:      /* NBSP */
1682           case 0x1680:    /* OGHAM SPACE MARK */
1683           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1684           case 0x2000:    /* EN QUAD */
1685           case 0x2001:    /* EM QUAD */
1686           case 0x2002:    /* EN SPACE */
1687           case 0x2003:    /* EM SPACE */
1688           case 0x2004:    /* THREE-PER-EM SPACE */
1689           case 0x2005:    /* FOUR-PER-EM SPACE */
1690           case 0x2006:    /* SIX-PER-EM SPACE */
1691           case 0x2007:    /* FIGURE SPACE */
1692           case 0x2008:    /* PUNCTUATION SPACE */
1693           case 0x2009:    /* THIN SPACE */
1694           case 0x200A:    /* HAIR SPACE */
1695           case 0x202f:    /* NARROW NO-BREAK SPACE */
1696           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1697           case 0x3000:    /* IDEOGRAPHIC SPACE */
1698           OK = TRUE;
1699           break;
1700
1701           default:
1702           OK = FALSE;
1703           break;
1704           }
1705
1706         if (OK == (d == OP_HSPACE))
1707           {
1708           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1709               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1710             {
1711             active_count--;           /* Remove non-match possibility */
1712             next_active_state--;
1713             }
1714           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1715           }
1716         }
1717       break;
1718
1719       /*-----------------------------------------------------------------*/
1720 #ifdef SUPPORT_UCP
1721       case OP_PROP_EXTRA + OP_TYPEEXACT:
1722       case OP_PROP_EXTRA + OP_TYPEUPTO:
1723       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727       count = current_state->count;  /* Number already matched */
1728       if (clen > 0)
1729         {
1730         BOOL OK;
1731         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1732         switch(code[1 + IMM2_SIZE + 1])
1733           {
1734           case PT_ANY:
1735           OK = TRUE;
1736           break;
1737
1738           case PT_LAMP:
1739           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1740             chartype == ucp_Lt;
1741           break;
1742
1743           case PT_GC:
1744           OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
1745           break;
1746
1747           case PT_PC:
1748           OK = chartype == code[1 + IMM2_SIZE + 2];
1749           break;
1750
1751           case PT_SC:
1752           OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
1753           break;
1754
1755           /* These are specials for combination cases. */
1756
1757           case PT_ALNUM:
1758           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1759                PRIV(ucp_gentype)[chartype] == ucp_N;
1760           break;
1761
1762           case PT_SPACE:    /* Perl space */
1763           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1764                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765           break;
1766
1767           case PT_PXSPACE:  /* POSIX space */
1768           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1769                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770                c == CHAR_FF || c == CHAR_CR;
1771           break;
1772
1773           case PT_WORD:
1774           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1775                PRIV(ucp_gentype)[chartype] == ucp_N ||
1776                c == CHAR_UNDERSCORE;
1777           break;
1778
1779           /* Should never occur, but keep compilers from grumbling. */
1780
1781           default:
1782           OK = codevalue != OP_PROP;
1783           break;
1784           }
1785
1786         if (OK == (d == OP_PROP))
1787           {
1788           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1789             {
1790             active_count--;           /* Remove non-match possibility */
1791             next_active_state--;
1792             }
1793           if (++count >= GET2(code, 1))
1794             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1795           else
1796             { ADD_NEW(state_offset, count); }
1797           }
1798         }
1799       break;
1800
1801       /*-----------------------------------------------------------------*/
1802       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1803       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1804       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808       count = current_state->count;  /* Number already matched */
1809       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810         {
1811         const pcre_uchar *nptr = ptr + clen;
1812         int ncount = 0;
1813         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814           {
1815           active_count--;           /* Remove non-match possibility */
1816           next_active_state--;
1817           }
1818         while (nptr < end_subject)
1819           {
1820           int nd;
1821           int ndlen = 1;
1822           GETCHARLEN(nd, nptr, ndlen);
1823           if (UCD_CATEGORY(nd) != ucp_M) break;
1824           ncount++;
1825           nptr += ndlen;
1826           }
1827         if (++count >= GET2(code, 1))
1828           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1829         else
1830           { ADD_NEW_DATA(-state_offset, count, ncount); }
1831         }
1832       break;
1833 #endif
1834
1835       /*-----------------------------------------------------------------*/
1836       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1837       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1838       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842       count = current_state->count;  /* Number already matched */
1843       if (clen > 0)
1844         {
1845         int ncount = 0;
1846         switch (c)
1847           {
1848           case 0x000b:
1849           case 0x000c:
1850           case 0x0085:
1851           case 0x2028:
1852           case 0x2029:
1853           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1854           goto ANYNL03;
1855
1856           case 0x000d:
1857           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1858           /* Fall through */
1859
1860           ANYNL03:
1861           case 0x000a:
1862           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1863             {
1864             active_count--;           /* Remove non-match possibility */
1865             next_active_state--;
1866             }
1867           if (++count >= GET2(code, 1))
1868             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1869           else
1870             { ADD_NEW_DATA(-state_offset, count, ncount); }
1871           break;
1872
1873           default:
1874           break;
1875           }
1876         }
1877       break;
1878
1879       /*-----------------------------------------------------------------*/
1880       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1881       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1882       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886       count = current_state->count;  /* Number already matched */
1887       if (clen > 0)
1888         {
1889         BOOL OK;
1890         switch (c)
1891           {
1892           case 0x000a:
1893           case 0x000b:
1894           case 0x000c:
1895           case 0x000d:
1896           case 0x0085:
1897           case 0x2028:
1898           case 0x2029:
1899           OK = TRUE;
1900           break;
1901
1902           default:
1903           OK = FALSE;
1904           }
1905
1906         if (OK == (d == OP_VSPACE))
1907           {
1908           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1909             {
1910             active_count--;           /* Remove non-match possibility */
1911             next_active_state--;
1912             }
1913           if (++count >= GET2(code, 1))
1914             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1915           else
1916             { ADD_NEW_DATA(-state_offset, count, 0); }
1917           }
1918         }
1919       break;
1920
1921       /*-----------------------------------------------------------------*/
1922       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1923       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1924       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928       count = current_state->count;  /* Number already matched */
1929       if (clen > 0)
1930         {
1931         BOOL OK;
1932         switch (c)
1933           {
1934           case 0x09:      /* HT */
1935           case 0x20:      /* SPACE */
1936           case 0xa0:      /* NBSP */
1937           case 0x1680:    /* OGHAM SPACE MARK */
1938           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1939           case 0x2000:    /* EN QUAD */
1940           case 0x2001:    /* EM QUAD */
1941           case 0x2002:    /* EN SPACE */
1942           case 0x2003:    /* EM SPACE */
1943           case 0x2004:    /* THREE-PER-EM SPACE */
1944           case 0x2005:    /* FOUR-PER-EM SPACE */
1945           case 0x2006:    /* SIX-PER-EM SPACE */
1946           case 0x2007:    /* FIGURE SPACE */
1947           case 0x2008:    /* PUNCTUATION SPACE */
1948           case 0x2009:    /* THIN SPACE */
1949           case 0x200A:    /* HAIR SPACE */
1950           case 0x202f:    /* NARROW NO-BREAK SPACE */
1951           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1952           case 0x3000:    /* IDEOGRAPHIC SPACE */
1953           OK = TRUE;
1954           break;
1955
1956           default:
1957           OK = FALSE;
1958           break;
1959           }
1960
1961         if (OK == (d == OP_HSPACE))
1962           {
1963           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1964             {
1965             active_count--;           /* Remove non-match possibility */
1966             next_active_state--;
1967             }
1968           if (++count >= GET2(code, 1))
1969             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1970           else
1971             { ADD_NEW_DATA(-state_offset, count, 0); }
1972           }
1973         }
1974       break;
1975
1976 /* ========================================================================== */
1977       /* These opcodes are followed by a character that is usually compared
1978       to the current subject character; it is loaded into d. We still get
1979       here even if there is no subject character, because in some cases zero
1980       repetitions are permitted. */
1981
1982       /*-----------------------------------------------------------------*/
1983       case OP_CHAR:
1984       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1985       break;
1986
1987       /*-----------------------------------------------------------------*/
1988       case OP_CHARI:
1989       if (clen == 0) break;
1990
1991 #ifdef SUPPORT_UTF
1992       if (utf)
1993         {
1994         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995           {
1996           unsigned int othercase;
1997           if (c < 128)
1998             othercase = fcc[c];
1999           else
2000             /* If we have Unicode property support, we can use it to test the
2001             other case of the character. */
2002 #ifdef SUPPORT_UCP
2003             othercase = UCD_OTHERCASE(c);
2004 #else
2005             othercase = NOTACHAR;
2006 #endif
2007
2008           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009           }
2010         }
2011       else
2012 #endif  /* SUPPORT_UTF */
2013       /* Not UTF mode */
2014         {
2015         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2016           { ADD_NEW(state_offset + 2, 0); }
2017         }
2018       break;
2019
2020
2021 #ifdef SUPPORT_UCP
2022       /*-----------------------------------------------------------------*/
2023       /* This is a tricky one because it can match more than one character.
2024       Find out how many characters to skip, and then set up a negative state
2025       to wait for them to pass before continuing. */
2026
2027       case OP_EXTUNI:
2028       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2029         {
2030         const pcre_uchar *nptr = ptr + clen;
2031         int ncount = 0;
2032         while (nptr < end_subject)
2033           {
2034           int nclen = 1;
2035           GETCHARLEN(c, nptr, nclen);
2036           if (UCD_CATEGORY(c) != ucp_M) break;
2037           ncount++;
2038           nptr += nclen;
2039           }
2040         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2041         }
2042       break;
2043 #endif
2044
2045       /*-----------------------------------------------------------------*/
2046       /* This is tricky, like EXTUNI, because it too can match more than one
2047       character (when CR is followed by LF). In this case, set up a negative
2048       state to wait for one character to pass before continuing. */
2049
2050       case OP_ANYNL:
2051       if (clen > 0) switch(c)
2052         {
2053         case 0x000b:
2054         case 0x000c:
2055         case 0x0085:
2056         case 0x2028:
2057         case 0x2029:
2058         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2059
2060         case 0x000a:
2061         ADD_NEW(state_offset + 1, 0);
2062         break;
2063
2064         case 0x000d:
2065         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2066           {
2067           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2068           }
2069         else
2070           {
2071           ADD_NEW(state_offset + 1, 0);
2072           }
2073         break;
2074         }
2075       break;
2076
2077       /*-----------------------------------------------------------------*/
2078       case OP_NOT_VSPACE:
2079       if (clen > 0) switch(c)
2080         {
2081         case 0x000a:
2082         case 0x000b:
2083         case 0x000c:
2084         case 0x000d:
2085         case 0x0085:
2086         case 0x2028:
2087         case 0x2029:
2088         break;
2089
2090         default:
2091         ADD_NEW(state_offset + 1, 0);
2092         break;
2093         }
2094       break;
2095
2096       /*-----------------------------------------------------------------*/
2097       case OP_VSPACE:
2098       if (clen > 0) switch(c)
2099         {
2100         case 0x000a:
2101         case 0x000b:
2102         case 0x000c:
2103         case 0x000d:
2104         case 0x0085:
2105         case 0x2028:
2106         case 0x2029:
2107         ADD_NEW(state_offset + 1, 0);
2108         break;
2109
2110         default: break;
2111         }
2112       break;
2113
2114       /*-----------------------------------------------------------------*/
2115       case OP_NOT_HSPACE:
2116       if (clen > 0) switch(c)
2117         {
2118         case 0x09:      /* HT */
2119         case 0x20:      /* SPACE */
2120         case 0xa0:      /* NBSP */
2121         case 0x1680:    /* OGHAM SPACE MARK */
2122         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2123         case 0x2000:    /* EN QUAD */
2124         case 0x2001:    /* EM QUAD */
2125         case 0x2002:    /* EN SPACE */
2126         case 0x2003:    /* EM SPACE */
2127         case 0x2004:    /* THREE-PER-EM SPACE */
2128         case 0x2005:    /* FOUR-PER-EM SPACE */
2129         case 0x2006:    /* SIX-PER-EM SPACE */
2130         case 0x2007:    /* FIGURE SPACE */
2131         case 0x2008:    /* PUNCTUATION SPACE */
2132         case 0x2009:    /* THIN SPACE */
2133         case 0x200A:    /* HAIR SPACE */
2134         case 0x202f:    /* NARROW NO-BREAK SPACE */
2135         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2136         case 0x3000:    /* IDEOGRAPHIC SPACE */
2137         break;
2138
2139         default:
2140         ADD_NEW(state_offset + 1, 0);
2141         break;
2142         }
2143       break;
2144
2145       /*-----------------------------------------------------------------*/
2146       case OP_HSPACE:
2147       if (clen > 0) switch(c)
2148         {
2149         case 0x09:      /* HT */
2150         case 0x20:      /* SPACE */
2151         case 0xa0:      /* NBSP */
2152         case 0x1680:    /* OGHAM SPACE MARK */
2153         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2154         case 0x2000:    /* EN QUAD */
2155         case 0x2001:    /* EM QUAD */
2156         case 0x2002:    /* EN SPACE */
2157         case 0x2003:    /* EM SPACE */
2158         case 0x2004:    /* THREE-PER-EM SPACE */
2159         case 0x2005:    /* FOUR-PER-EM SPACE */
2160         case 0x2006:    /* SIX-PER-EM SPACE */
2161         case 0x2007:    /* FIGURE SPACE */
2162         case 0x2008:    /* PUNCTUATION SPACE */
2163         case 0x2009:    /* THIN SPACE */
2164         case 0x200A:    /* HAIR SPACE */
2165         case 0x202f:    /* NARROW NO-BREAK SPACE */
2166         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2167         case 0x3000:    /* IDEOGRAPHIC SPACE */
2168         ADD_NEW(state_offset + 1, 0);
2169         break;
2170         }
2171       break;
2172
2173       /*-----------------------------------------------------------------*/
2174       /* Match a negated single character casefully. This is only used for
2175       one-byte characters, that is, we know that d < 256. The character we are
2176       checking (c) can be multibyte. */
2177
2178       case OP_NOT:
2179       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180       break;
2181
2182       /*-----------------------------------------------------------------*/
2183       /* Match a negated single character caselessly. This is only used for
2184       one-byte characters, that is, we know that d < 256. The character we are
2185       checking (c) can be multibyte. */
2186
2187       case OP_NOTI:
2188       if (clen > 0 && c != d && c != fcc[d])
2189         { ADD_NEW(state_offset + dlen + 1, 0); }
2190       break;
2191
2192       /*-----------------------------------------------------------------*/
2193       case OP_PLUSI:
2194       case OP_MINPLUSI:
2195       case OP_POSPLUSI:
2196       case OP_NOTPLUSI:
2197       case OP_NOTMINPLUSI:
2198       case OP_NOTPOSPLUSI:
2199       caseless = TRUE;
2200       codevalue -= OP_STARI - OP_STAR;
2201
2202       /* Fall through */
2203       case OP_PLUS:
2204       case OP_MINPLUS:
2205       case OP_POSPLUS:
2206       case OP_NOTPLUS:
2207       case OP_NOTMINPLUS:
2208       case OP_NOTPOSPLUS:
2209       count = current_state->count;  /* Already matched */
2210       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2211       if (clen > 0)
2212         {
2213         unsigned int otherd = NOTACHAR;
2214         if (caseless)
2215           {
2216 #ifdef SUPPORT_UTF
2217           if (utf && d >= 128)
2218             {
2219 #ifdef SUPPORT_UCP
2220             otherd = UCD_OTHERCASE(d);
2221 #endif  /* SUPPORT_UCP */
2222             }
2223           else
2224 #endif  /* SUPPORT_UTF */
2225           otherd = TABLE_GET(d, fcc, d);
2226           }
2227         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2228           {
2229           if (count > 0 &&
2230               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2231             {
2232             active_count--;             /* Remove non-match possibility */
2233             next_active_state--;
2234             }
2235           count++;
2236           ADD_NEW(state_offset, count);
2237           }
2238         }
2239       break;
2240
2241       /*-----------------------------------------------------------------*/
2242       case OP_QUERYI:
2243       case OP_MINQUERYI:
2244       case OP_POSQUERYI:
2245       case OP_NOTQUERYI:
2246       case OP_NOTMINQUERYI:
2247       case OP_NOTPOSQUERYI:
2248       caseless = TRUE;
2249       codevalue -= OP_STARI - OP_STAR;
2250       /* Fall through */
2251       case OP_QUERY:
2252       case OP_MINQUERY:
2253       case OP_POSQUERY:
2254       case OP_NOTQUERY:
2255       case OP_NOTMINQUERY:
2256       case OP_NOTPOSQUERY:
2257       ADD_ACTIVE(state_offset + dlen + 1, 0);
2258       if (clen > 0)
2259         {
2260         unsigned int otherd = NOTACHAR;
2261         if (caseless)
2262           {
2263 #ifdef SUPPORT_UTF
2264           if (utf && d >= 128)
2265             {
2266 #ifdef SUPPORT_UCP
2267             otherd = UCD_OTHERCASE(d);
2268 #endif  /* SUPPORT_UCP */
2269             }
2270           else
2271 #endif  /* SUPPORT_UTF */
2272           otherd = TABLE_GET(d, fcc, d);
2273           }
2274         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2275           {
2276           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2277             {
2278             active_count--;            /* Remove non-match possibility */
2279             next_active_state--;
2280             }
2281           ADD_NEW(state_offset + dlen + 1, 0);
2282           }
2283         }
2284       break;
2285
2286       /*-----------------------------------------------------------------*/
2287       case OP_STARI:
2288       case OP_MINSTARI:
2289       case OP_POSSTARI:
2290       case OP_NOTSTARI:
2291       case OP_NOTMINSTARI:
2292       case OP_NOTPOSSTARI:
2293       caseless = TRUE;
2294       codevalue -= OP_STARI - OP_STAR;
2295       /* Fall through */
2296       case OP_STAR:
2297       case OP_MINSTAR:
2298       case OP_POSSTAR:
2299       case OP_NOTSTAR:
2300       case OP_NOTMINSTAR:
2301       case OP_NOTPOSSTAR:
2302       ADD_ACTIVE(state_offset + dlen + 1, 0);
2303       if (clen > 0)
2304         {
2305         unsigned int otherd = NOTACHAR;
2306         if (caseless)
2307           {
2308 #ifdef SUPPORT_UTF
2309           if (utf && d >= 128)
2310             {
2311 #ifdef SUPPORT_UCP
2312             otherd = UCD_OTHERCASE(d);
2313 #endif  /* SUPPORT_UCP */
2314             }
2315           else
2316 #endif  /* SUPPORT_UTF */
2317           otherd = TABLE_GET(d, fcc, d);
2318           }
2319         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2320           {
2321           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2322             {
2323             active_count--;            /* Remove non-match possibility */
2324             next_active_state--;
2325             }
2326           ADD_NEW(state_offset, 0);
2327           }
2328         }
2329       break;
2330
2331       /*-----------------------------------------------------------------*/
2332       case OP_EXACTI:
2333       case OP_NOTEXACTI:
2334       caseless = TRUE;
2335       codevalue -= OP_STARI - OP_STAR;
2336       /* Fall through */
2337       case OP_EXACT:
2338       case OP_NOTEXACT:
2339       count = current_state->count;  /* Number already matched */
2340       if (clen > 0)
2341         {
2342         unsigned int otherd = NOTACHAR;
2343         if (caseless)
2344           {
2345 #ifdef SUPPORT_UTF
2346           if (utf && d >= 128)
2347             {
2348 #ifdef SUPPORT_UCP
2349             otherd = UCD_OTHERCASE(d);
2350 #endif  /* SUPPORT_UCP */
2351             }
2352           else
2353 #endif  /* SUPPORT_UTF */
2354           otherd = TABLE_GET(d, fcc, d);
2355           }
2356         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2357           {
2358           if (++count >= GET2(code, 1))
2359             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2360           else
2361             { ADD_NEW(state_offset, count); }
2362           }
2363         }
2364       break;
2365
2366       /*-----------------------------------------------------------------*/
2367       case OP_UPTOI:
2368       case OP_MINUPTOI:
2369       case OP_POSUPTOI:
2370       case OP_NOTUPTOI:
2371       case OP_NOTMINUPTOI:
2372       case OP_NOTPOSUPTOI:
2373       caseless = TRUE;
2374       codevalue -= OP_STARI - OP_STAR;
2375       /* Fall through */
2376       case OP_UPTO:
2377       case OP_MINUPTO:
2378       case OP_POSUPTO:
2379       case OP_NOTUPTO:
2380       case OP_NOTMINUPTO:
2381       case OP_NOTPOSUPTO:
2382       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2383       count = current_state->count;  /* Number already matched */
2384       if (clen > 0)
2385         {
2386         unsigned int otherd = NOTACHAR;
2387         if (caseless)
2388           {
2389 #ifdef SUPPORT_UTF
2390           if (utf && d >= 128)
2391             {
2392 #ifdef SUPPORT_UCP
2393             otherd = UCD_OTHERCASE(d);
2394 #endif  /* SUPPORT_UCP */
2395             }
2396           else
2397 #endif  /* SUPPORT_UTF */
2398           otherd = TABLE_GET(d, fcc, d);
2399           }
2400         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2401           {
2402           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2403             {
2404             active_count--;             /* Remove non-match possibility */
2405             next_active_state--;
2406             }
2407           if (++count >= GET2(code, 1))
2408             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2409           else
2410             { ADD_NEW(state_offset, count); }
2411           }
2412         }
2413       break;
2414
2415
2416 /* ========================================================================== */
2417       /* These are the class-handling opcodes */
2418
2419       case OP_CLASS:
2420       case OP_NCLASS:
2421       case OP_XCLASS:
2422         {
2423         BOOL isinclass = FALSE;
2424         int next_state_offset;
2425         const pcre_uchar *ecode;
2426
2427         /* For a simple class, there is always just a 32-byte table, and we
2428         can set isinclass from it. */
2429
2430         if (codevalue != OP_XCLASS)
2431           {
2432           ecode = code + 1 + (32 / sizeof(pcre_uchar));
2433           if (clen > 0)
2434             {
2435             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2436               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2437             }
2438           }
2439
2440         /* An extended class may have a table or a list of single characters,
2441         ranges, or both, and it may be positive or negative. There's a
2442         function that sorts all this out. */
2443
2444         else
2445          {
2446          ecode = code + GET(code, 1);
2447          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2448          }
2449
2450         /* At this point, isinclass is set for all kinds of class, and ecode
2451         points to the byte after the end of the class. If there is a
2452         quantifier, this is where it will be. */
2453
2454         next_state_offset = (int)(ecode - start_code);
2455
2456         switch (*ecode)
2457           {
2458           case OP_CRSTAR:
2459           case OP_CRMINSTAR:
2460           ADD_ACTIVE(next_state_offset + 1, 0);
2461           if (isinclass) { ADD_NEW(state_offset, 0); }
2462           break;
2463
2464           case OP_CRPLUS:
2465           case OP_CRMINPLUS:
2466           count = current_state->count;  /* Already matched */
2467           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2468           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2469           break;
2470
2471           case OP_CRQUERY:
2472           case OP_CRMINQUERY:
2473           ADD_ACTIVE(next_state_offset + 1, 0);
2474           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2475           break;
2476
2477           case OP_CRRANGE:
2478           case OP_CRMINRANGE:
2479           count = current_state->count;  /* Already matched */
2480           if (count >= GET2(ecode, 1))
2481             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2482           if (isinclass)
2483             {
2484             int max = GET2(ecode, 1 + IMM2_SIZE);
2485             if (++count >= max && max != 0)   /* Max 0 => no limit */
2486               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2487             else
2488               { ADD_NEW(state_offset, count); }
2489             }
2490           break;
2491
2492           default:
2493           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2494           break;
2495           }
2496         }
2497       break;
2498
2499 /* ========================================================================== */
2500       /* These are the opcodes for fancy brackets of various kinds. We have
2501       to use recursion in order to handle them. The "always failing" assertion
2502       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503       though the other "backtracking verbs" are not supported. */
2504
2505       case OP_FAIL:
2506       forced_fail++;    /* Count FAILs for multiple states */
2507       break;
2508
2509       case OP_ASSERT:
2510       case OP_ASSERT_NOT:
2511       case OP_ASSERTBACK:
2512       case OP_ASSERTBACK_NOT:
2513         {
2514         int rc;
2515         int local_offsets[2];
2516         int local_workspace[1000];
2517         const pcre_uchar *endasscode = code + GET(code, 1);
2518
2519         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2520
2521         rc = internal_dfa_exec(
2522           md,                                   /* static match data */
2523           code,                                 /* this subexpression's code */
2524           ptr,                                  /* where we currently are */
2525           (int)(ptr - start_subject),           /* start offset */
2526           local_offsets,                        /* offset vector */
2527           sizeof(local_offsets)/sizeof(int),    /* size of same */
2528           local_workspace,                      /* workspace vector */
2529           sizeof(local_workspace)/sizeof(int),  /* size of same */
2530           rlevel);                              /* function recursion level */
2531
2532         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2535         }
2536       break;
2537
2538       /*-----------------------------------------------------------------*/
2539       case OP_COND:
2540       case OP_SCOND:
2541         {
2542         int local_offsets[1000];
2543         int local_workspace[1000];
2544         int codelink = GET(code, 1);
2545         int condcode;
2546
2547         /* Because of the way auto-callout works during compile, a callout item
2548         is inserted between OP_COND and an assertion condition. This does not
2549         happen for the other conditions. */
2550
2551         if (code[LINK_SIZE+1] == OP_CALLOUT)
2552           {
2553           rrc = 0;
2554           if (PUBL(callout) != NULL)
2555             {
2556             PUBL(callout_block) cb;
2557             cb.version          = 1;   /* Version 1 of the callout block */
2558             cb.callout_number   = code[LINK_SIZE+2];
2559             cb.offset_vector    = offsets;
2560 #ifdef COMPILE_PCRE8
2561             cb.subject          = (PCRE_SPTR)start_subject;
2562 #else
2563             cb.subject          = (PCRE_SPTR16)start_subject;
2564 #endif
2565             cb.subject_length   = (int)(end_subject - start_subject);
2566             cb.start_match      = (int)(current_subject - start_subject);
2567             cb.current_position = (int)(ptr - start_subject);
2568             cb.pattern_position = GET(code, LINK_SIZE + 3);
2569             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2570             cb.capture_top      = 1;
2571             cb.capture_last     = -1;
2572             cb.callout_data     = md->callout_data;
2573             cb.mark             = NULL;   /* No (*MARK) support */
2574             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2575             }
2576           if (rrc > 0) break;                      /* Fail this thread */
2577           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2578           }
2579
2580         condcode = code[LINK_SIZE+1];
2581
2582         /* Back reference conditions are not supported */
2583
2584         if (condcode == OP_CREF || condcode == OP_NCREF)
2585           return PCRE_ERROR_DFA_UCOND;
2586
2587         /* The DEFINE condition is always false */
2588
2589         if (condcode == OP_DEF)
2590           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2591
2592         /* The only supported version of OP_RREF is for the value RREF_ANY,
2593         which means "test if in any recursion". We can't test for specifically
2594         recursed groups. */
2595
2596         else if (condcode == OP_RREF || condcode == OP_NRREF)
2597           {
2598           int value = GET2(code, LINK_SIZE + 2);
2599           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2600           if (md->recursive != NULL)
2601             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2602           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2603           }
2604
2605         /* Otherwise, the condition is an assertion */
2606
2607         else
2608           {
2609           int rc;
2610           const pcre_uchar *asscode = code + LINK_SIZE + 1;
2611           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2612
2613           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2614
2615           rc = internal_dfa_exec(
2616             md,                                   /* fixed match data */
2617             asscode,                              /* this subexpression's code */
2618             ptr,                                  /* where we currently are */
2619             (int)(ptr - start_subject),           /* start offset */
2620             local_offsets,                        /* offset vector */
2621             sizeof(local_offsets)/sizeof(int),    /* size of same */
2622             local_workspace,                      /* workspace vector */
2623             sizeof(local_workspace)/sizeof(int),  /* size of same */
2624             rlevel);                              /* function recursion level */
2625
2626           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2627           if ((rc >= 0) ==
2628                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2629             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2630           else
2631             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2632           }
2633         }
2634       break;
2635
2636       /*-----------------------------------------------------------------*/
2637       case OP_RECURSE:
2638         {
2639         dfa_recursion_info *ri;
2640         int local_offsets[1000];
2641         int local_workspace[1000];
2642         const pcre_uchar *callpat = start_code + GET(code, 1);
2643         int recno = (callpat == md->start_code)? 0 :
2644           GET2(callpat, 1 + LINK_SIZE);
2645         int rc;
2646
2647         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2648
2649         /* Check for repeating a recursion without advancing the subject
2650         pointer. This should catch convoluted mutual recursions. (Some simple
2651         cases are caught at compile time.) */
2652
2653         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2654           if (recno == ri->group_num && ptr == ri->subject_position)
2655             return PCRE_ERROR_RECURSELOOP;
2656
2657         /* Remember this recursion and where we started it so as to
2658         catch infinite loops. */
2659
2660         new_recursive.group_num = recno;
2661         new_recursive.subject_position = ptr;
2662         new_recursive.prevrec = md->recursive;
2663         md->recursive = &new_recursive;
2664
2665         rc = internal_dfa_exec(
2666           md,                                   /* fixed match data */
2667           callpat,                              /* this subexpression's code */
2668           ptr,                                  /* where we currently are */
2669           (int)(ptr - start_subject),           /* start offset */
2670           local_offsets,                        /* offset vector */
2671           sizeof(local_offsets)/sizeof(int),    /* size of same */
2672           local_workspace,                      /* workspace vector */
2673           sizeof(local_workspace)/sizeof(int),  /* size of same */
2674           rlevel);                              /* function recursion level */
2675
2676         md->recursive = new_recursive.prevrec;  /* Done this recursion */
2677
2678         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2679           rc));
2680
2681         /* Ran out of internal offsets */
2682
2683         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2684
2685         /* For each successful matched substring, set up the next state with a
2686         count of characters to skip before trying it. Note that the count is in
2687         characters, not bytes. */
2688
2689         if (rc > 0)
2690           {
2691           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2692             {
2693             int charcount = local_offsets[rc+1] - local_offsets[rc];
2694 #ifdef SUPPORT_UTF
2695             const pcre_uchar *p = start_subject + local_offsets[rc];
2696             const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2697             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2698 #endif
2699             if (charcount > 0)
2700               {
2701               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2702               }
2703             else
2704               {
2705               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2706               }
2707             }
2708           }
2709         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2710         }
2711       break;
2712
2713       /*-----------------------------------------------------------------*/
2714       case OP_BRAPOS:
2715       case OP_SBRAPOS:
2716       case OP_CBRAPOS:
2717       case OP_SCBRAPOS:
2718       case OP_BRAPOSZERO:
2719         {
2720         int charcount, matched_count;
2721         const pcre_uchar *local_ptr = ptr;
2722         BOOL allow_zero;
2723
2724         if (codevalue == OP_BRAPOSZERO)
2725           {
2726           allow_zero = TRUE;
2727           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2728           }
2729         else allow_zero = FALSE;
2730
2731         /* Loop to match the subpattern as many times as possible as if it were
2732         a complete pattern. */
2733
2734         for (matched_count = 0;; matched_count++)
2735           {
2736           int local_offsets[2];
2737           int local_workspace[1000];
2738
2739           int rc = internal_dfa_exec(
2740             md,                                   /* fixed match data */
2741             code,                                 /* this subexpression's code */
2742             local_ptr,                            /* where we currently are */
2743             (int)(ptr - start_subject),           /* start offset */
2744             local_offsets,                        /* offset vector */
2745             sizeof(local_offsets)/sizeof(int),    /* size of same */
2746             local_workspace,                      /* workspace vector */
2747             sizeof(local_workspace)/sizeof(int),  /* size of same */
2748             rlevel);                              /* function recursion level */
2749
2750           /* Failed to match */
2751
2752           if (rc < 0)
2753             {
2754             if (rc != PCRE_ERROR_NOMATCH) return rc;
2755             break;
2756             }
2757
2758           /* Matched: break the loop if zero characters matched. */
2759
2760           charcount = local_offsets[1] - local_offsets[0];
2761           if (charcount == 0) break;
2762           local_ptr += charcount;    /* Advance temporary position ptr */
2763           }
2764
2765         /* At this point we have matched the subpattern matched_count
2766         times, and local_ptr is pointing to the character after the end of the
2767         last match. */
2768
2769         if (matched_count > 0 || allow_zero)
2770           {
2771           const pcre_uchar *end_subpattern = code;
2772           int next_state_offset;
2773
2774           do { end_subpattern += GET(end_subpattern, 1); }
2775             while (*end_subpattern == OP_ALT);
2776           next_state_offset =
2777             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2778
2779           /* Optimization: if there are no more active states, and there
2780           are no new states yet set up, then skip over the subject string
2781           right here, to save looping. Otherwise, set up the new state to swing
2782           into action when the end of the matched substring is reached. */
2783
2784           if (i + 1 >= active_count && new_count == 0)
2785             {
2786             ptr = local_ptr;
2787             clen = 0;
2788             ADD_NEW(next_state_offset, 0);
2789             }
2790           else
2791             {
2792             const pcre_uchar *p = ptr;
2793             const pcre_uchar *pp = local_ptr;
2794             charcount = (int)(pp - p);
2795 #ifdef SUPPORT_UTF
2796             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2797 #endif
2798             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2799             }
2800           }
2801         }
2802       break;
2803
2804       /*-----------------------------------------------------------------*/
2805       case OP_ONCE:
2806       case OP_ONCE_NC:
2807         {
2808         int local_offsets[2];
2809         int local_workspace[1000];
2810
2811         int rc = internal_dfa_exec(
2812           md,                                   /* fixed match data */
2813           code,                                 /* this subexpression's code */
2814           ptr,                                  /* where we currently are */
2815           (int)(ptr - start_subject),           /* start offset */
2816           local_offsets,                        /* offset vector */
2817           sizeof(local_offsets)/sizeof(int),    /* size of same */
2818           local_workspace,                      /* workspace vector */
2819           sizeof(local_workspace)/sizeof(int),  /* size of same */
2820           rlevel);                              /* function recursion level */
2821
2822         if (rc >= 0)
2823           {
2824           const pcre_uchar *end_subpattern = code;
2825           int charcount = local_offsets[1] - local_offsets[0];
2826           int next_state_offset, repeat_state_offset;
2827
2828           do { end_subpattern += GET(end_subpattern, 1); }
2829             while (*end_subpattern == OP_ALT);
2830           next_state_offset =
2831             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2832
2833           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2834           arrange for the repeat state also to be added to the relevant list.
2835           Calculate the offset, or set -1 for no repeat. */
2836
2837           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2838                                  *end_subpattern == OP_KETRMIN)?
2839             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2840
2841           /* If we have matched an empty string, add the next state at the
2842           current character pointer. This is important so that the duplicate
2843           checking kicks in, which is what breaks infinite loops that match an
2844           empty string. */
2845
2846           if (charcount == 0)
2847             {
2848             ADD_ACTIVE(next_state_offset, 0);
2849             }
2850
2851           /* Optimization: if there are no more active states, and there
2852           are no new states yet set up, then skip over the subject string
2853           right here, to save looping. Otherwise, set up the new state to swing
2854           into action when the end of the matched substring is reached. */
2855
2856           else if (i + 1 >= active_count && new_count == 0)
2857             {
2858             ptr += charcount;
2859             clen = 0;
2860             ADD_NEW(next_state_offset, 0);
2861
2862             /* If we are adding a repeat state at the new character position,
2863             we must fudge things so that it is the only current state.
2864             Otherwise, it might be a duplicate of one we processed before, and
2865             that would cause it to be skipped. */
2866
2867             if (repeat_state_offset >= 0)
2868               {
2869               next_active_state = active_states;
2870               active_count = 0;
2871               i = -1;
2872               ADD_ACTIVE(repeat_state_offset, 0);
2873               }
2874             }
2875           else
2876             {
2877 #ifdef SUPPORT_UTF
2878             const pcre_uchar *p = start_subject + local_offsets[0];
2879             const pcre_uchar *pp = start_subject + local_offsets[1];
2880             while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2881 #endif
2882             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2883             if (repeat_state_offset >= 0)
2884               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2885             }
2886           }
2887         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2888         }
2889       break;
2890
2891
2892 /* ========================================================================== */
2893       /* Handle callouts */
2894
2895       case OP_CALLOUT:
2896       rrc = 0;
2897       if (PUBL(callout) != NULL)
2898         {
2899         PUBL(callout_block) cb;
2900         cb.version          = 1;   /* Version 1 of the callout block */
2901         cb.callout_number   = code[1];
2902         cb.offset_vector    = offsets;
2903 #ifdef COMPILE_PCRE8
2904         cb.subject          = (PCRE_SPTR)start_subject;
2905 #else
2906         cb.subject          = (PCRE_SPTR16)start_subject;
2907 #endif
2908         cb.subject_length   = (int)(end_subject - start_subject);
2909         cb.start_match      = (int)(current_subject - start_subject);
2910         cb.current_position = (int)(ptr - start_subject);
2911         cb.pattern_position = GET(code, 2);
2912         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2913         cb.capture_top      = 1;
2914         cb.capture_last     = -1;
2915         cb.callout_data     = md->callout_data;
2916         cb.mark             = NULL;   /* No (*MARK) support */
2917         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2918         }
2919       if (rrc == 0)
2920         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2921       break;
2922
2923
2924 /* ========================================================================== */
2925       default:        /* Unsupported opcode */
2926       return PCRE_ERROR_DFA_UITEM;
2927       }
2928
2929     NEXT_ACTIVE_STATE: continue;
2930
2931     }      /* End of loop scanning active states */
2932
2933   /* We have finished the processing at the current subject character. If no
2934   new states have been set for the next character, we have found all the
2935   matches that we are going to find. If we are at the top level and partial
2936   matching has been requested, check for appropriate conditions.
2937
2938   The "forced_fail" variable counts the number of (*F) encountered for the
2939   character. If it is equal to the original active_count (saved in
2940   workspace[1]) it means that (*F) was found on every active state. In this
2941   case we don't want to give a partial match.
2942
2943   The "could_continue" variable is true if a state could have continued but
2944   for the fact that the end of the subject was reached. */
2945
2946   if (new_count <= 0)
2947     {
2948     if (rlevel == 1 &&                               /* Top level, and */
2949         could_continue &&                            /* Some could go on */
2950         forced_fail != workspace[1] &&               /* Not all forced fail & */
2951         (                                            /* either... */
2952         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2953         ||                                           /* or... */
2954         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2955          match_count < 0)                            /* no matches */
2956         ) &&                                         /* And... */
2957         ptr >= end_subject &&                  /* Reached end of subject */
2958         ptr > md->start_used_ptr)              /* Inspected non-empty string */
2959       {
2960       if (offsetcount >= 2)
2961         {
2962         offsets[0] = (int)(md->start_used_ptr - start_subject);
2963         offsets[1] = (int)(end_subject - start_subject);
2964         }
2965       match_count = PCRE_ERROR_PARTIAL;
2966       }
2967
2968     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2969       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2970       rlevel*2-2, SP));
2971     break;        /* In effect, "return", but see the comment below */
2972     }
2973
2974   /* One or more states are active for the next character. */
2975
2976   ptr += clen;    /* Advance to next subject character */
2977   }               /* Loop to move along the subject string */
2978
2979 /* Control gets here from "break" a few lines above. We do it this way because
2980 if we use "return" above, we have compiler trouble. Some compilers warn if
2981 there's nothing here because they think the function doesn't return a value. On
2982 the other hand, if we put a dummy statement here, some more clever compilers
2983 complain that it can't be reached. Sigh. */
2984
2985 return match_count;
2986 }
2987
2988
2989
2990
2991 /*************************************************
2992 *    Execute a Regular Expression - DFA engine   *
2993 *************************************************/
2994
2995 /* This external function applies a compiled re to a subject string using a DFA
2996 engine. This function calls the internal function multiple times if the pattern
2997 is not anchored.
2998
2999 Arguments:
3000   argument_re     points to the compiled expression
3001   extra_data      points to extra data or is NULL
3002   subject         points to the subject string
3003   length          length of subject string (may contain binary zeros)
3004   start_offset    where to start in the subject string
3005   options         option bits
3006   offsets         vector of match offsets
3007   offsetcount     size of same
3008   workspace       workspace vector
3009   wscount         size of same
3010
3011 Returns:          > 0 => number of match offset pairs placed in offsets
3012                   = 0 => offsets overflowed; longest matches are present
3013                    -1 => failed to match
3014                  < -1 => some kind of unexpected problem
3015 */
3016
3017 #ifdef COMPILE_PCRE8
3018 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3019 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3020   const char *subject, int length, int start_offset, int options, int *offsets,
3021   int offsetcount, int *workspace, int wscount)
3022 #else
3023 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3024 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3025   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3026   int offsetcount, int *workspace, int wscount)
3027 #endif
3028 {
3029 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3030 dfa_match_data match_block;
3031 dfa_match_data *md = &match_block;
3032 BOOL utf, anchored, startline, firstline;
3033 const pcre_uchar *current_subject, *end_subject;
3034 const pcre_study_data *study = NULL;
3035
3036 const pcre_uchar *req_char_ptr;
3037 const pcre_uint8 *start_bits = NULL;
3038 BOOL has_first_char = FALSE;
3039 BOOL has_req_char = FALSE;
3040 pcre_uchar first_char = 0;
3041 pcre_uchar first_char2 = 0;
3042 pcre_uchar req_char = 0;
3043 pcre_uchar req_char2 = 0;
3044 int newline;
3045
3046 /* Plausibility checks */
3047
3048 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3049 if (re == NULL || subject == NULL || workspace == NULL ||
3050    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3051 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3052 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3053 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3054
3055 /* We need to find the pointer to any study data before we test for byte
3056 flipping, so we scan the extra_data block first. This may set two fields in the
3057 match block, so we must initialize them beforehand. However, the other fields
3058 in the match block must not be set until after the byte flipping. */
3059
3060 md->tables = re->tables;
3061 md->callout_data = NULL;
3062
3063 if (extra_data != NULL)
3064   {
3065   unsigned int flags = extra_data->flags;
3066   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3067     study = (const pcre_study_data *)extra_data->study_data;
3068   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3069   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3070     return PCRE_ERROR_DFA_UMLIMIT;
3071   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3072     md->callout_data = extra_data->callout_data;
3073   if ((flags & PCRE_EXTRA_TABLES) != 0)
3074     md->tables = extra_data->tables;
3075   }
3076
3077 /* Check that the first field in the block is the magic number. If it is not,
3078 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3079 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3080 means that the pattern is likely compiled with different endianness. */
3081
3082 if (re->magic_number != MAGIC_NUMBER)
3083   return re->magic_number == REVERSED_MAGIC_NUMBER?
3084     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3085 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3086
3087 /* Set some local values */
3088
3089 current_subject = (const pcre_uchar *)subject + start_offset;
3090 end_subject = (const pcre_uchar *)subject + length;
3091 req_char_ptr = current_subject - 1;
3092
3093 #ifdef SUPPORT_UTF
3094 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3095 utf = (re->options & PCRE_UTF8) != 0;
3096 #else
3097 utf = FALSE;
3098 #endif
3099
3100 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3101   (re->options & PCRE_ANCHORED) != 0;
3102
3103 /* The remaining fixed data for passing around. */
3104
3105 md->start_code = (const pcre_uchar *)argument_re +
3106     re->name_table_offset + re->name_count * re->name_entry_size;
3107 md->start_subject = (const pcre_uchar *)subject;
3108 md->end_subject = end_subject;
3109 md->start_offset = start_offset;
3110 md->moptions = options;
3111 md->poptions = re->options;
3112
3113 /* If the BSR option is not set at match time, copy what was set
3114 at compile time. */
3115
3116 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3117   {
3118   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3119     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3120 #ifdef BSR_ANYCRLF
3121   else md->moptions |= PCRE_BSR_ANYCRLF;
3122 #endif
3123   }
3124
3125 /* Handle different types of newline. The three bits give eight cases. If
3126 nothing is set at run time, whatever was used at compile time applies. */
3127
3128 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3129          PCRE_NEWLINE_BITS)
3130   {
3131   case 0: newline = NEWLINE; break;   /* Compile-time default */
3132   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3133   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3134   case PCRE_NEWLINE_CR+
3135        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3136   case PCRE_NEWLINE_ANY: newline = -1; break;
3137   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3138   default: return PCRE_ERROR_BADNEWLINE;
3139   }
3140
3141 if (newline == -2)
3142   {
3143   md->nltype = NLTYPE_ANYCRLF;
3144   }
3145 else if (newline < 0)
3146   {
3147   md->nltype = NLTYPE_ANY;
3148   }
3149 else
3150   {
3151   md->nltype = NLTYPE_FIXED;
3152   if (newline > 255)
3153     {
3154     md->nllen = 2;
3155     md->nl[0] = (newline >> 8) & 255;
3156     md->nl[1] = newline & 255;
3157     }
3158   else
3159     {
3160     md->nllen = 1;
3161     md->nl[0] = newline;
3162     }
3163   }
3164
3165 /* Check a UTF string if required. If it is invalid, the error offset and the
3166 reason code are passed back in offsets[0] and offsets[1] when there is room. */
3167
3168 #ifdef SUPPORT_UTF
3169 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3170   {
3171   int erroroffset;
3172   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3173   if (errorcode != 0)
3174     {
3175     if (offsetcount >= 2)
3176       {
3177       offsets[0] = erroroffset;
3178       offsets[1] = errorcode;
3179       }
3180     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3181       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3182     }
3183   if (start_offset > 0 && start_offset < length &&
3184         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3185     return PCRE_ERROR_BADUTF8_OFFSET;
3186   }
3187 #endif
3188
3189 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3190 is a feature that makes it possible to save compiled regex and re-use them
3191 in other programs later. */
3192
3193 if (md->tables == NULL) md->tables = PRIV(default_tables);
3194
3195 /* The "must be at the start of a line" flags are used in a loop when finding
3196 where to start. */
3197
3198 startline = (re->flags & PCRE_STARTLINE) != 0;
3199 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3200
3201 /* Set up the first character to match, if available. The first_char value is
3202 never set for an anchored regular expression, but the anchoring may be forced
3203 at run time, so we have to test for anchoring. The first char may be unset for
3204 an unanchored pattern, of course. If there's no first char and the pattern was
3205 studied, there may be a bitmap of possible first characters. */
3206
3207 if (!anchored)
3208   {
3209   if ((re->flags & PCRE_FIRSTSET) != 0)
3210     {
3211     has_first_char = TRUE;
3212     first_char = first_char2 = (pcre_uchar)(re->first_char);
3213     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3214       {
3215       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3216 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3217       if (utf && first_char > 127)
3218         first_char2 = UCD_OTHERCASE(first_char);
3219 #endif
3220       }
3221     }
3222   else
3223     {
3224     if (!startline && study != NULL &&
3225          (study->flags & PCRE_STUDY_MAPPED) != 0)
3226       start_bits = study->start_bits;
3227     }
3228   }
3229
3230 /* For anchored or unanchored matches, there may be a "last known required
3231 character" set. */
3232
3233 if ((re->flags & PCRE_REQCHSET) != 0)
3234   {
3235   has_req_char = TRUE;
3236   req_char = req_char2 = (pcre_uchar)(re->req_char);
3237   if ((re->flags & PCRE_RCH_CASELESS) != 0)
3238     {
3239     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3240 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3241     if (utf && req_char > 127)
3242       req_char2 = UCD_OTHERCASE(req_char);
3243 #endif
3244     }
3245   }
3246
3247 /* Call the main matching function, looping for a non-anchored regex after a
3248 failed match. If not restarting, perform certain optimizations at the start of
3249 a match. */
3250
3251 for (;;)
3252   {
3253   int rc;
3254
3255   if ((options & PCRE_DFA_RESTART) == 0)
3256     {
3257     const pcre_uchar *save_end_subject = end_subject;
3258
3259     /* If firstline is TRUE, the start of the match is constrained to the first
3260     line of a multiline string. Implement this by temporarily adjusting
3261     end_subject so that we stop scanning at a newline. If the match fails at
3262     the newline, later code breaks this loop. */
3263
3264     if (firstline)
3265       {
3266       PCRE_PUCHAR t = current_subject;
3267 #ifdef SUPPORT_UTF
3268       if (utf)
3269         {
3270         while (t < md->end_subject && !IS_NEWLINE(t))
3271           {
3272           t++;
3273           ACROSSCHAR(t < end_subject, *t, t++);
3274           }
3275         }
3276       else
3277 #endif
3278       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3279       end_subject = t;
3280       }
3281
3282     /* There are some optimizations that avoid running the match if a known
3283     starting point is not found. However, there is an option that disables
3284     these, for testing and for ensuring that all callouts do actually occur.
3285     The option can be set in the regex by (*NO_START_OPT) or passed in
3286     match-time options. */
3287
3288     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3289       {
3290       /* Advance to a known first char. */
3291
3292       if (has_first_char)
3293         {
3294         if (first_char != first_char2)
3295           while (current_subject < end_subject &&
3296               *current_subject != first_char && *current_subject != first_char2)
3297             current_subject++;
3298         else
3299           while (current_subject < end_subject &&
3300                  *current_subject != first_char)
3301             current_subject++;
3302         }
3303
3304       /* Or to just after a linebreak for a multiline match if possible */
3305
3306       else if (startline)
3307         {
3308         if (current_subject > md->start_subject + start_offset)
3309           {
3310 #ifdef SUPPORT_UTF
3311           if (utf)
3312             {
3313             while (current_subject < end_subject &&
3314                    !WAS_NEWLINE(current_subject))
3315               {
3316               current_subject++;
3317               ACROSSCHAR(current_subject < end_subject, *current_subject,
3318                 current_subject++);
3319               }
3320             }
3321           else
3322 #endif
3323           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3324             current_subject++;
3325
3326           /* If we have just passed a CR and the newline option is ANY or
3327           ANYCRLF, and we are now at a LF, advance the match position by one
3328           more character. */
3329
3330           if (current_subject[-1] == CHAR_CR &&
3331                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3332                current_subject < end_subject &&
3333                *current_subject == CHAR_NL)
3334             current_subject++;
3335           }
3336         }
3337
3338       /* Or to a non-unique first char after study */
3339
3340       else if (start_bits != NULL)
3341         {
3342         while (current_subject < end_subject)
3343           {
3344           register unsigned int c = *current_subject;
3345 #ifndef COMPILE_PCRE8
3346           if (c > 255) c = 255;
3347 #endif
3348           if ((start_bits[c/8] & (1 << (c&7))) == 0)
3349             {
3350             current_subject++;
3351 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3352             /* In non 8-bit mode, the iteration will stop for
3353             characters > 255 at the beginning or not stop at all. */
3354             if (utf)
3355               ACROSSCHAR(current_subject < end_subject, *current_subject,
3356                 current_subject++);
3357 #endif
3358             }
3359           else break;
3360           }
3361         }
3362       }
3363
3364     /* Restore fudged end_subject */
3365
3366     end_subject = save_end_subject;
3367
3368     /* The following two optimizations are disabled for partial matching or if
3369     disabling is explicitly requested (and of course, by the test above, this
3370     code is not obeyed when restarting after a partial match). */
3371
3372     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3373         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3374       {
3375       /* If the pattern was studied, a minimum subject length may be set. This
3376       is a lower bound; no actual string of that length may actually match the
3377       pattern. Although the value is, strictly, in characters, we treat it as
3378       bytes to avoid spending too much time in this optimization. */
3379
3380       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3381           (pcre_uint32)(end_subject - current_subject) < study->minlength)
3382         return PCRE_ERROR_NOMATCH;
3383
3384       /* If req_char is set, we know that that character must appear in the
3385       subject for the match to succeed. If the first character is set, req_char
3386       must be later in the subject; otherwise the test starts at the match
3387       point. This optimization can save a huge amount of work in patterns with
3388       nested unlimited repeats that aren't going to match. Writing separate
3389       code for cased/caseless versions makes it go faster, as does using an
3390       autoincrement and backing off on a match.
3391
3392       HOWEVER: when the subject string is very, very long, searching to its end
3393       can take a long time, and give bad performance on quite ordinary
3394       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3395       string... so we don't do this when the string is sufficiently long. */
3396
3397       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3398         {
3399         register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3400
3401         /* We don't need to repeat the search if we haven't yet reached the
3402         place we found it at last time. */
3403
3404         if (p > req_char_ptr)
3405           {
3406           if (req_char != req_char2)
3407             {
3408             while (p < end_subject)
3409               {
3410               register int pp = *p++;
3411               if (pp == req_char || pp == req_char2) { p--; break; }
3412               }
3413             }
3414           else
3415             {
3416             while (p < end_subject)
3417               {
3418               if (*p++ == req_char) { p--; break; }
3419               }
3420             }
3421
3422           /* If we can't find the required character, break the matching loop,
3423           which will cause a return of PCRE_ERROR_NOMATCH. */
3424
3425           if (p >= end_subject) break;
3426
3427           /* If we have found the required character, save the point where we
3428           found it, so that we don't search again next time round the loop if
3429           the start hasn't passed this character yet. */
3430
3431           req_char_ptr = p;
3432           }
3433         }
3434       }
3435     }   /* End of optimizations that are done when not restarting */
3436
3437   /* OK, now we can do the business */
3438
3439   md->start_used_ptr = current_subject;
3440   md->recursive = NULL;
3441
3442   rc = internal_dfa_exec(
3443     md,                                /* fixed match data */
3444     md->start_code,                    /* this subexpression's code */
3445     current_subject,                   /* where we currently are */
3446     start_offset,                      /* start offset in subject */
3447     offsets,                           /* offset vector */
3448     offsetcount,                       /* size of same */
3449     workspace,                         /* workspace vector */
3450     wscount,                           /* size of same */
3451     0);                                /* function recurse level */
3452
3453   /* Anything other than "no match" means we are done, always; otherwise, carry
3454   on only if not anchored. */
3455
3456   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3457
3458   /* Advance to the next subject character unless we are at the end of a line
3459   and firstline is set. */
3460
3461   if (firstline && IS_NEWLINE(current_subject)) break;
3462   current_subject++;
3463 #ifdef SUPPORT_UTF
3464   if (utf)
3465     {
3466     ACROSSCHAR(current_subject < end_subject, *current_subject,
3467       current_subject++);
3468     }
3469 #endif
3470   if (current_subject > end_subject) break;
3471
3472   /* If we have just passed a CR and we are now at a LF, and the pattern does
3473   not contain any explicit matches for \r or \n, and the newline option is CRLF
3474   or ANY or ANYCRLF, advance the match position by one more character. */
3475
3476   if (current_subject[-1] == CHAR_CR &&
3477       current_subject < end_subject &&
3478       *current_subject == CHAR_NL &&
3479       (re->flags & PCRE_HASCRORLF) == 0 &&
3480         (md->nltype == NLTYPE_ANY ||
3481          md->nltype == NLTYPE_ANYCRLF ||
3482          md->nllen == 2))
3483     current_subject++;
3484
3485   }   /* "Bumpalong" loop */
3486
3487 return PCRE_ERROR_NOMATCH;
3488 }
3489
3490 /* End of pcre_dfa_exec.c */