glib/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language (but see
   7 below for why this module is different).
   8
   9                        Written by Philip Hazel
  10            Copyright (c) 1997-2012 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  48 the performance of his patterns greatly. I could not use it as it stood, as it
  49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  50 test 7 to loop, and test 9 to crash with a segfault.
  51
  52 The issue is the check for duplicate states, which is done by a simple linear
  53 search up the state list. (Grep for "duplicate" below to find the code.) For
  54 many patterns, there will never be many states active at one time, so a simple
  55 linear search is fine. In patterns that have many active states, it might be a
  56 bottleneck. The suggested code used an indexing scheme to remember which states
  57 had previously been used for each character, and avoided the linear search when
  58 it knew there was no chance of a duplicate. This was implemented when adding
  59 states to the state lists.
  60
  61 I wrote some thread-safe, not-limited code to try something similar at the time
  62 of checking for duplicates (instead of when adding states), using index vectors
  63 on the stack. It did give a 13% improvement with one specially constructed
  64 pattern for certain subject strings, but on other strings and on many of the
  65 simpler patterns in the test suite it did worse. The major problem, I think,
  66 was the extra time to initialize the index. This had to be done for each call
  67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
  68 only once - I suspect this was the cause of the problems with the tests.)
  69
  70 Overall, I concluded that the gains in some cases did not outweigh the losses
  71 in others, so I abandoned this code. */
  72
  73
  74
  75 #ifdef HAVE_CONFIG_H
  76 #include "config.h"
  77 #endif
  78
  79 #define NLBLOCK md             /* Block containing newline information */
  80 #define PSSTART start_subject  /* Field containing processed string start */
  81 #define PSEND   end_subject    /* Field containing processed string end */
  82
  83 #include "pcre_internal.h"
  84
  85
   86 /* For use to indent debugging output. The debugging printf/DPRINTF calls in
      this file pass SP to a "%.*s" conversion with a precision of rlevel*2-2, so
      the indent grows by two spaces for each level of recursion. */
   87
   88 #define SP "                   "
  89
  90
  91 /*************************************************
  92 *      Code parameters and static tables         *
  93 *************************************************/
  94
   95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
   96 into others, under special conditions. A gap of 20 between the blocks should be
   97 enough. The resulting opcodes don't have to be less than 256 because they are
   98 never stored, so we push them well clear of the normal opcodes.

      In internal_dfa_exec() the relevant offset is added to the local codevalue
      when the item that follows a type-repeat opcode is OP_PROP or OP_NOTPROP
      (PROP), OP_EXTUNI (EXTUNI), OP_ANYNL (ANYNL), OP_HSPACE or OP_NOT_HSPACE
      (HSPACE), or OP_VSPACE or OP_NOT_VSPACE (VSPACE), so that these combinations
      can be dispatched separately from the ordinary type repeats. */
   99
  100 #define OP_PROP_EXTRA       300
  101 #define OP_EXTUNI_EXTRA     320
  102 #define OP_ANYNL_EXTRA      340
  103 #define OP_HSPACE_EXTRA     360
  104 #define OP_VSPACE_EXTRA     380
 105
 106
  107 /* This table identifies those opcodes that are followed immediately by a
  108 character that is to be tested in some way. This makes it possible to
  109 centralize the loading of these characters. In the case of Type * etc, the
  110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
  111 small value. Non-zero values in the table are the offsets from the opcode where
  112 the character is to be found. ***NOTE*** If the start of this table is
  113 modified, the three tables that follow must also be modified.

      The table is indexed by opcode value: internal_dfa_exec() looks up
      coptable[codevalue] for the opcode of the state being processed and, when
      the entry is non-zero, loads the item found at that offset from the opcode.
      The entries of 1+IMM2_SIZE are those of the counted repeats (upto, minupto,
      exact, upto+); all other non-zero entries are 1. */
  114
  115 static const pcre_uint8 coptable[] = {
  116   0,                             /* End                                    */
  117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  120   0, 0,                          /* \P, \p                                 */
  121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  122   0,                             /* \X                                     */
  123   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  124   1,                             /* Char                                   */
  125   1,                             /* Chari                                  */
  126   1,                             /* not                                    */
  127   1,                             /* noti                                   */
  128   /* Positive single-char repeats                                          */
  129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  131   1+IMM2_SIZE,                   /* exact                                  */
  132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  135   1+IMM2_SIZE,                   /* exact I                                */
  136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  137   /* Negative single-char repeats - only for chars < 256                   */
  138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  140   1+IMM2_SIZE,                   /* NOT exact                              */
  141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  144   1+IMM2_SIZE,                   /* NOT exact I                            */
  145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  146   /* Positive type repeats                                                 */
  147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  149   1+IMM2_SIZE,                   /* Type exact                             */
  150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  151   /* Character class & ref repeats                                         */
  152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
  154   0,                             /* CLASS                                  */
  155   0,                             /* NCLASS                                 */
  156   0,                             /* XCLASS - variable length               */
  157   0,                             /* REF                                    */
  158   0,                             /* REFI                                   */
  159   0,                             /* RECURSE                                */
  160   0,                             /* CALLOUT                                */
  161   0,                             /* Alt                                    */
  162   0,                             /* Ket                                    */
  163   0,                             /* KetRmax                                */
  164   0,                             /* KetRmin                                */
  165   0,                             /* KetRpos                                */
  166   0,                             /* Reverse                                */
  167   0,                             /* Assert                                 */
  168   0,                             /* Assert not                             */
  169   0,                             /* Assert behind                          */
  170   0,                             /* Assert behind not                      */
  171   0, 0,                          /* ONCE, ONCE_NC                          */
  172   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  173   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  174   0, 0,                          /* CREF, NCREF                            */
  175   0, 0,                          /* RREF, NRREF                            */
  176   0,                             /* DEF                                    */
  177   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  178   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  179   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  180   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  181   0, 0                           /* CLOSE, SKIPZERO                        */
  182 };
 183
  184 /* This table identifies those opcodes that inspect a character. It is used to
  185 remember the fact that a character could have been inspected when the end of
  186 the subject is reached. ***NOTE*** If the start of this table is modified, the
  187 two tables that follow must also be modified.

      The table is indexed by opcode value. internal_dfa_exec() consults
      poptable[codevalue] only when there is no current subject character
      (clen == 0), and sets its could_continue flag when the entry is non-zero. */
  188
  189 static const pcre_uint8 poptable[] = {
  190   0,                             /* End                                    */
  191   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  192   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  193   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  194   1, 1,                          /* \P, \p                                 */
  195   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  196   1,                             /* \X                                     */
  197   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
  198   1,                             /* Char                                   */
  199   1,                             /* Chari                                  */
  200   1,                             /* not                                    */
  201   1,                             /* noti                                   */
  202   /* Positive single-char repeats                                          */
  203   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  204   1, 1, 1,                       /* upto, minupto, exact                   */
  205   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  206   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  207   1, 1, 1,                       /* upto I, minupto I, exact I             */
  208   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  209   /* Negative single-char repeats - only for chars < 256                   */
  210   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  211   1, 1, 1,                       /* NOT upto, minupto, exact               */
  212   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  213   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  214   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  215   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  216   /* Positive type repeats                                                 */
  217   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  218   1, 1, 1,                       /* Type upto, minupto, exact              */
  219   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  220   /* Character class & ref repeats                                         */
  221   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  222   1, 1,                          /* CRRANGE, CRMINRANGE                    */
  223   1,                             /* CLASS                                  */
  224   1,                             /* NCLASS                                 */
  225   1,                             /* XCLASS - variable length               */
  226   0,                             /* REF                                    */
  227   0,                             /* REFI                                   */
  228   0,                             /* RECURSE                                */
  229   0,                             /* CALLOUT                                */
  230   0,                             /* Alt                                    */
  231   0,                             /* Ket                                    */
  232   0,                             /* KetRmax                                */
  233   0,                             /* KetRmin                                */
  234   0,                             /* KetRpos                                */
  235   0,                             /* Reverse                                */
  236   0,                             /* Assert                                 */
  237   0,                             /* Assert not                             */
  238   0,                             /* Assert behind                          */
  239   0,                             /* Assert behind not                      */
  240   0, 0,                          /* ONCE, ONCE_NC                          */
  241   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  242   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  243   0, 0,                          /* CREF, NCREF                            */
  244   0, 0,                          /* RREF, NRREF                            */
  245   0,                             /* DEF                                    */
  246   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  247   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  248   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  249   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  250   0, 0                           /* CLOSE, SKIPZERO                        */
  251 };
 252
 253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 254 and \w */
 255
 256 static const pcre_uint8 toptable1[] = {
 257   0, 0, 0, 0, 0, 0,
 258   ctype_digit, ctype_digit,
 259   ctype_space, ctype_space,
 260   ctype_word,  ctype_word,
 261   0, 0                            /* OP_ANY, OP_ALLANY */
 262 };
 263
 264 static const pcre_uint8 toptable2[] = {
 265   0, 0, 0, 0, 0, 0,
 266   ctype_digit, 0,
 267   ctype_space, 0,
 268   ctype_word,  0,
 269   1, 1                            /* OP_ANY, OP_ALLANY */
 270 };
 271
 272
 273 /* Structure for holding data about a particular state, which is in effect the
 274 current data for an active path through the match tree. It must consist
 275 entirely of ints because the working vector we are passed, and which we put
 276 these structures in, is a vector of ints. */
 277
 278 typedef struct stateblock {
 279   int offset;                     /* Offset to opcode */
 280   int count;                      /* Count for repeats */
 281   int data;                       /* Some use extra data */
 282 } stateblock;
 283
 284 #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
 285
 286
 287 #ifdef PCRE_DEBUG
 288 /*************************************************
 289 *             Print character string             *
 290 *************************************************/
 291
 292 /* Character string printing function for debugging.
 293
 294 Arguments:
 295   p            points to string
 296   length       number of bytes
 297   f            where to print
 298
 299 Returns:       nothing
 300 */
 301
 302 static void
 303 pchars(const pcre_uchar *p, int length, FILE *f)
 304 {
 305 int c;
 306 while (length-- > 0)
 307   {
 308   if (isprint(c = *(p++)))
 309     fprintf(f, "%c", c);
 310   else
 311     fprintf(f, "\\x%02x", c);
 312   }
 313 }
 314 #endif
 315
 316
 317
 318 /*************************************************
 319 *    Execute a Regular Expression - DFA engine   *
 320 *************************************************/
 321
 322 /* This internal function applies a compiled pattern to a subject string,
 323 starting at a given point, using a DFA engine. This function is called from the
 324 external one, possibly multiple times if the pattern is not anchored. The
 325 function calls itself recursively for some kinds of subpattern.
 326
 327 Arguments:
 328   md                the match_data block with fixed information
 329   this_start_code   the opening bracket of this subexpression's code
 330   current_subject   where we currently are in the subject string
 331   start_offset      start offset in the subject string
 332   offsets           vector to contain the matching string offsets
 333   offsetcount       size of same
 334   workspace         vector of workspace
 335   wscount           size of same
 336   rlevel            function call recursion level
 337
 338 Returns:            > 0 => number of match offset pairs placed in offsets
 339                     = 0 => offsets overflowed; longest matches are present
 340                      -1 => failed to match
 341                    < -1 => some kind of unexpected problem
 342
 343 The following macros are used for adding states to the two state vectors (one
 344 for the current character, one for the following character). */
 345
 346 #define ADD_ACTIVE(x,y) \
 347   if (active_count++ < wscount) \
 348     { \
 349     next_active_state->offset = (x); \
 350     next_active_state->count  = (y); \
 351     next_active_state++; \
 352     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 353     } \
 354   else return PCRE_ERROR_DFA_WSSIZE
 355
 356 #define ADD_ACTIVE_DATA(x,y,z) \
 357   if (active_count++ < wscount) \
 358     { \
 359     next_active_state->offset = (x); \
 360     next_active_state->count  = (y); \
 361     next_active_state->data   = (z); \
 362     next_active_state++; \
 363     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 364     } \
 365   else return PCRE_ERROR_DFA_WSSIZE
 366
 367 #define ADD_NEW(x,y) \
 368   if (new_count++ < wscount) \
 369     { \
 370     next_new_state->offset = (x); \
 371     next_new_state->count  = (y); \
 372     next_new_state++; \
 373     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 374     } \
 375   else return PCRE_ERROR_DFA_WSSIZE
 376
 377 #define ADD_NEW_DATA(x,y,z) \
 378   if (new_count++ < wscount) \
 379     { \
 380     next_new_state->offset = (x); \
 381     next_new_state->count  = (y); \
 382     next_new_state->data   = (z); \
 383     next_new_state++; \
 384     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
 385       (x), (y), (z), __LINE__)); \
 386     } \
 387   else return PCRE_ERROR_DFA_WSSIZE
 388
 389 /* And now, here is the code */
 390
 391 static int
 392 internal_dfa_exec(
 393   dfa_match_data *md,
 394   const pcre_uchar *this_start_code,
 395   const pcre_uchar *current_subject,
 396   int start_offset,
 397   int *offsets,
 398   int offsetcount,
 399   int *workspace,
 400   int wscount,
 401   int  rlevel)
 402 {
 403 stateblock *active_states, *new_states, *temp_states;
 404 stateblock *next_active_state, *next_new_state;
 405
 406 const pcre_uint8 *ctypes, *lcc, *fcc;
 407 const pcre_uchar *ptr;
 408 const pcre_uchar *end_code, *first_op;
 409
 410 dfa_recursion_info new_recursive;
 411
 412 int active_count, new_count, match_count;
 413
 414 /* Some fields in the md block are frequently referenced, so we load them into
 415 independent variables in the hope that this will perform better. */
 416
 417 const pcre_uchar *start_subject = md->start_subject;
 418 const pcre_uchar *end_subject = md->end_subject;
 419 const pcre_uchar *start_code = md->start_code;
 420
 421 #ifdef SUPPORT_UTF
 422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 423 #else
 424 BOOL utf = FALSE;
 425 #endif
 426
 427 BOOL reset_could_continue = FALSE;
 428
 429 rlevel++;
 430 offsetcount &= (-2);
 431
 432 wscount -= 2;
 433 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 434           (2 * INTS_PER_STATEBLOCK);
 435
 436 DPRINTF(("\n%.*s---------------------\n"
 437   "%.*sCall to internal_dfa_exec f=%d\n",
 438   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
 439
 440 ctypes = md->tables + ctypes_offset;
 441 lcc = md->tables + lcc_offset;
 442 fcc = md->tables + fcc_offset;
 443
 444 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 445
 446 active_states = (stateblock *)(workspace + 2);
 447 next_new_state = new_states = active_states + wscount;
 448 new_count = 0;
 449
 450 first_op = this_start_code + 1 + LINK_SIZE +
 451   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 452     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 453     ? IMM2_SIZE:0);
 454
 455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 456 the alternative states onto the list, and find out where the end is. This
  457 makes it possible to use this function recursively, when we want to stop at a
 458 matching internal ket rather than at the end.
 459
 460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 461 a backward assertion. In that case, we have to find out the maximum amount to
 462 move back, and set up each alternative appropriately. */
 463
 464 if (*first_op == OP_REVERSE)
 465   {
 466   int max_back = 0;
 467   int gone_back;
 468
 469   end_code = this_start_code;
 470   do
 471     {
 472     int back = GET(end_code, 2+LINK_SIZE);
 473     if (back > max_back) max_back = back;
 474     end_code += GET(end_code, 1);
 475     }
 476   while (*end_code == OP_ALT);
 477
 478   /* If we can't go back the amount required for the longest lookbehind
 479   pattern, go back as far as we can; some alternatives may still be viable. */
 480
 481 #ifdef SUPPORT_UTF
 482   /* In character mode we have to step back character by character */
 483
 484   if (utf)
 485     {
 486     for (gone_back = 0; gone_back < max_back; gone_back++)
 487       {
 488       if (current_subject <= start_subject) break;
 489       current_subject--;
 490       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
 491       }
 492     }
 493   else
 494 #endif
 495
 496   /* In byte-mode we can do this quickly. */
 497
 498     {
 499     gone_back = (current_subject - max_back < start_subject)?
 500       (int)(current_subject - start_subject) : max_back;
 501     current_subject -= gone_back;
 502     }
 503
 504   /* Save the earliest consulted character */
 505
 506   if (current_subject < md->start_used_ptr)
 507     md->start_used_ptr = current_subject;
 508
 509   /* Now we can process the individual branches. */
 510
 511   end_code = this_start_code;
 512   do
 513     {
 514     int back = GET(end_code, 2+LINK_SIZE);
 515     if (back <= gone_back)
 516       {
 517       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
 518       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 519       }
 520     end_code += GET(end_code, 1);
 521     }
 522   while (*end_code == OP_ALT);
 523  }
 524
 525 /* This is the code for a "normal" subpattern (not a backward assertion). The
 526 start of a whole pattern is always one of these. If we are at the top level,
 527 we may be asked to restart matching from the same point that we reached for a
 528 previous partial match. We still have to scan through the top-level branches to
 529 find the end state. */
 530
 531 else
 532   {
 533   end_code = this_start_code;
 534
 535   /* Restarting */
 536
 537   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 538     {
 539     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 540     new_count = workspace[1];
 541     if (!workspace[0])
 542       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 543     }
 544
 545   /* Not restarting */
 546
 547   else
 548     {
 549     int length = 1 + LINK_SIZE +
 550       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 551         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 552         ? IMM2_SIZE:0);
 553     do
 554       {
 555       ADD_NEW((int)(end_code - start_code + length), 0);
 556       end_code += GET(end_code, 1);
 557       length = 1 + LINK_SIZE;
 558       }
 559     while (*end_code == OP_ALT);
 560     }
 561   }
 562
 563 workspace[0] = 0;    /* Bit indicating which vector is current */
 564
 565 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
 566
 567 /* Loop for scanning the subject */
 568
 569 ptr = current_subject;
 570 for (;;)
 571   {
 572   int i, j;
 573   int clen, dlen;
 574   unsigned int c, d;
 575   int forced_fail = 0;
 576   BOOL partial_newline = FALSE;
 577   BOOL could_continue = reset_could_continue;
 578   reset_could_continue = FALSE;
 579
 580   /* Make the new state list into the active state list and empty the
 581   new state list. */
 582
 583   temp_states = active_states;
 584   active_states = new_states;
 585   new_states = temp_states;
 586   active_count = new_count;
 587   new_count = 0;
 588
 589   workspace[0] ^= 1;              /* Remember for the restarting feature */
 590   workspace[1] = active_count;
 591
 592 #ifdef PCRE_DEBUG
 593   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 594   pchars(ptr, STRLEN_UC(ptr), stdout);
 595   printf("\"\n");
 596
 597   printf("%.*sActive states: ", rlevel*2-2, SP);
 598   for (i = 0; i < active_count; i++)
 599     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 600   printf("\n");
 601 #endif
 602
 603   /* Set the pointers for adding new states */
 604
 605   next_active_state = active_states + active_count;
 606   next_new_state = new_states;
 607
 608   /* Load the current character from the subject outside the loop, as many
 609   different states may want to look at it, and we assume that at least one
 610   will. */
 611
 612   if (ptr < end_subject)
 613     {
 614     clen = 1;        /* Number of data items in the character */
 615 #ifdef SUPPORT_UTF
 616     if (utf) { GETCHARLEN(c, ptr, clen); } else
 617 #endif  /* SUPPORT_UTF */
 618     c = *ptr;
 619     }
 620   else
 621     {
 622     clen = 0;        /* This indicates the end of the subject */
 623     c = NOTACHAR;    /* This value should never actually be used */
 624     }
 625
 626   /* Scan up the active states and act on each one. The result of an action
 627   may be to add more states to the currently active list (e.g. on hitting a
 628   parenthesis) or it may be to put states on the new list, for considering
 629   when we move the character pointer on. */
 630
 631   for (i = 0; i < active_count; i++)
 632     {
 633     stateblock *current_state = active_states + i;
 634     BOOL caseless = FALSE;
 635     const pcre_uchar *code;
 636     int state_offset = current_state->offset;
 637     int count, codevalue, rrc;
 638
 639 #ifdef PCRE_DEBUG
 640     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 641     if (clen == 0) printf("EOL\n");
 642       else if (c > 32 && c < 127) printf("'%c'\n", c);
 643         else printf("0x%02x\n", c);
 644 #endif
 645
 646     /* A negative offset is a special case meaning "hold off going to this
 647     (negated) state until the number of characters in the data field have
 648     been skipped". If the could_continue flag was passed over from a previous
  649     state, arrange for it to be passed on. */
 650
 651     if (state_offset < 0)
 652       {
 653       if (current_state->data > 0)
 654         {
 655         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 656         ADD_NEW_DATA(state_offset, current_state->count,
 657           current_state->data - 1);
 658         if (could_continue) reset_could_continue = TRUE;
 659         continue;
 660         }
 661       else
 662         {
 663         current_state->offset = state_offset = -state_offset;
 664         }
 665       }
 666
 667     /* Check for a duplicate state with the same count, and skip if found.
 668     See the note at the head of this module about the possibility of improving
 669     performance here. */
 670
 671     for (j = 0; j < i; j++)
 672       {
 673       if (active_states[j].offset == state_offset &&
 674           active_states[j].count == current_state->count)
 675         {
 676         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 677         goto NEXT_ACTIVE_STATE;
 678         }
 679       }
 680
 681     /* The state offset is the offset to the opcode */
 682
 683     code = start_code + state_offset;
 684     codevalue = *code;
 685
 686     /* If this opcode inspects a character, but we are at the end of the
 687     subject, remember the fact for use when testing for a partial match. */
 688
 689     if (clen == 0 && poptable[codevalue] != 0)
 690       could_continue = TRUE;
 691
 692     /* If this opcode is followed by an inline character, load it. It is
 693     tempting to test for the presence of a subject character here, but that
 694     is wrong, because sometimes zero repetitions of the subject are
 695     permitted.
 696
 697     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 698     argument that is not a data character - but is always one byte long because
 699     the values are small. We have to take special action to deal with  \P, \p,
 700     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
 701     these ones to new opcodes. */
 702
 703     if (coptable[codevalue] > 0)
 704       {
 705       dlen = 1;
 706 #ifdef SUPPORT_UTF
 707       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 708 #endif  /* SUPPORT_UTF */
 709       d = code[coptable[codevalue]];
 710       if (codevalue >= OP_TYPESTAR)
 711         {
 712         switch(d)
 713           {
 714           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 715           case OP_NOTPROP:
 716           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 717           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 718           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 719           case OP_NOT_HSPACE:
 720           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 721           case OP_NOT_VSPACE:
 722           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 723           default: break;
 724           }
 725         }
 726       }
 727     else
 728       {
 729       dlen = 0;         /* Not strictly necessary, but compilers moan */
 730       d = NOTACHAR;     /* if these variables are not set. */
 731       }
 732
 733
 734     /* Now process the individual opcodes */
 735
 736     switch (codevalue)
 737       {
 738 /* ========================================================================== */
 739       /* These cases are never obeyed. This is a fudge that causes a compile-
 740       time error if the vectors coptable or poptable, which are indexed by
 741       opcode, are not the correct length. It seems to be the only way to do
 742       such a check at compile time, as the sizeof() operator does not work
 743       in the C preprocessor. */
 744
 745       case OP_TABLE_LENGTH:
 746       case OP_TABLE_LENGTH +
 747         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 748          (sizeof(poptable) == OP_TABLE_LENGTH)):
 749       break;
 750
 751 /* ========================================================================== */
 752       /* Reached a closing bracket. If not at the end of the pattern, carry
 753       on with the next opcode. For repeating opcodes, also add the repeat
 754       state. Note that KETRPOS will always be encountered at the end of the
 755       subpattern, because the possessive subpattern repeats are always handled
 756       using recursive calls. Thus, it never adds any new states.
 757
 758       At the end of the (sub)pattern, unless we have an empty string and
 759       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
 760       start of the subject, save the match data, shifting up all previous
 761       matches so we always have the longest first. */
 762
 763       case OP_KET:
 764       case OP_KETRMIN:
 765       case OP_KETRMAX:
 766       case OP_KETRPOS:
 767       if (code != end_code)
 768         {
 769         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 770         if (codevalue != OP_KET)
 771           {
 772           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 773           }
 774         }
 775       else
 776         {
 777         if (ptr > current_subject ||
 778             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
 779               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
 780                 current_subject > start_subject + md->start_offset)))
 781           {
 782           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 783             else if (match_count > 0 && ++match_count * 2 > offsetcount)
 784               match_count = 0;
 785           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 786           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 787           if (offsetcount >= 2)
 788             {
 789             offsets[0] = (int)(current_subject - start_subject);
 790             offsets[1] = (int)(ptr - start_subject);
 791             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 792               offsets[1] - offsets[0], (char *)current_subject));
 793             }
 794           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 795             {
 796             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 797               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 798               match_count, rlevel*2-2, SP));
 799             return match_count;
 800             }
 801           }
 802         }
 803       break;
 804
 805 /* ========================================================================== */
 806       /* These opcodes add to the current list of states without looking
 807       at the current character. */
 808
 809       /*-----------------------------------------------------------------*/
 810       case OP_ALT:
 811       do { code += GET(code, 1); } while (*code == OP_ALT);
 812       ADD_ACTIVE((int)(code - start_code), 0);
 813       break;
 814
 815       /*-----------------------------------------------------------------*/
 816       case OP_BRA:
 817       case OP_SBRA:
 818       do
 819         {
 820         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 821         code += GET(code, 1);
 822         }
 823       while (*code == OP_ALT);
 824       break;
 825
 826       /*-----------------------------------------------------------------*/
 827       case OP_CBRA:
 828       case OP_SCBRA:
 829       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
 830       code += GET(code, 1);
 831       while (*code == OP_ALT)
 832         {
 833         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
 834         code += GET(code, 1);
 835         }
 836       break;
 837
 838       /*-----------------------------------------------------------------*/
 839       case OP_BRAZERO:
 840       case OP_BRAMINZERO:
 841       ADD_ACTIVE(state_offset + 1, 0);
 842       code += 1 + GET(code, 2);
 843       while (*code == OP_ALT) code += GET(code, 1);
 844       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 845       break;
 846
 847       /*-----------------------------------------------------------------*/
 848       case OP_SKIPZERO:
 849       code += 1 + GET(code, 2);
 850       while (*code == OP_ALT) code += GET(code, 1);
 851       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 852       break;
 853
 854       /*-----------------------------------------------------------------*/
 855       case OP_CIRC:
 856       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
 857         { ADD_ACTIVE(state_offset + 1, 0); }
 858       break;
 859
 860       /*-----------------------------------------------------------------*/
 861       case OP_CIRCM:
 862       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 863           (ptr != end_subject && WAS_NEWLINE(ptr)))
 864         { ADD_ACTIVE(state_offset + 1, 0); }
 865       break;
 866
 867       /*-----------------------------------------------------------------*/
 868       case OP_EOD:
 869       if (ptr >= end_subject)
 870         {
 871         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 872           could_continue = TRUE;
 873         else { ADD_ACTIVE(state_offset + 1, 0); }
 874         }
 875       break;
 876
 877       /*-----------------------------------------------------------------*/
 878       case OP_SOD:
 879       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 880       break;
 881
 882       /*-----------------------------------------------------------------*/
 883       case OP_SOM:
 884       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 885       break;
 886
 887
 888 /* ========================================================================== */
 889       /* These opcodes inspect the next subject character, and sometimes
 890       the previous one as well, but do not have an argument. The variable
 891       clen contains the length of the current character and is zero if we are
 892       at the end of the subject. */
 893
 894       /*-----------------------------------------------------------------*/
 895       case OP_ANY:
 896       if (clen > 0 && !IS_NEWLINE(ptr))
 897         {
 898         if (ptr + 1 >= md->end_subject &&
 899             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 900             NLBLOCK->nltype == NLTYPE_FIXED &&
 901             NLBLOCK->nllen == 2 &&
 902             c == NLBLOCK->nl[0])
 903           {
 904           could_continue = partial_newline = TRUE;
 905           }
 906         else
 907           {
 908           ADD_NEW(state_offset + 1, 0);
 909           }
 910         }
 911       break;
 912
 913       /*-----------------------------------------------------------------*/
 914       case OP_ALLANY:
 915       if (clen > 0)
 916         { ADD_NEW(state_offset + 1, 0); }
 917       break;
 918
 919       /*-----------------------------------------------------------------*/
 920       case OP_EODN:
 921       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 922         could_continue = TRUE;
 923       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 924         { ADD_ACTIVE(state_offset + 1, 0); }
 925       break;
 926
 927       /*-----------------------------------------------------------------*/
 928       case OP_DOLL:
 929       if ((md->moptions & PCRE_NOTEOL) == 0)
 930         {
 931         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 932           could_continue = TRUE;
 933         else if (clen == 0 ||
 934             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
 935                (ptr == end_subject - md->nllen)
 936             ))
 937           { ADD_ACTIVE(state_offset + 1, 0); }
 938         else if (ptr + 1 >= md->end_subject &&
 939                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 940                  NLBLOCK->nltype == NLTYPE_FIXED &&
 941                  NLBLOCK->nllen == 2 &&
 942                  c == NLBLOCK->nl[0])
 943           {
 944           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 945             {
 946             reset_could_continue = TRUE;
 947             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 948             }
 949           else could_continue = partial_newline = TRUE;
 950           }
 951         }
 952       break;
 953
 954       /*-----------------------------------------------------------------*/
 955       case OP_DOLLM:
 956       if ((md->moptions & PCRE_NOTEOL) == 0)
 957         {
 958         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 959           could_continue = TRUE;
 960         else if (clen == 0 ||
 961             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
 962           { ADD_ACTIVE(state_offset + 1, 0); }
 963         else if (ptr + 1 >= md->end_subject &&
 964                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 965                  NLBLOCK->nltype == NLTYPE_FIXED &&
 966                  NLBLOCK->nllen == 2 &&
 967                  c == NLBLOCK->nl[0])
 968           {
 969           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 970             {
 971             reset_could_continue = TRUE;
 972             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 973             }
 974           else could_continue = partial_newline = TRUE;
 975           }
 976         }
 977       else if (IS_NEWLINE(ptr))
 978         { ADD_ACTIVE(state_offset + 1, 0); }
 979       break;
 980
 981       /*-----------------------------------------------------------------*/
 982
 983       case OP_DIGIT:
 984       case OP_WHITESPACE:
 985       case OP_WORDCHAR:
 986       if (clen > 0 && c < 256 &&
 987             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 988         { ADD_NEW(state_offset + 1, 0); }
 989       break;
 990
 991       /*-----------------------------------------------------------------*/
 992       case OP_NOT_DIGIT:
 993       case OP_NOT_WHITESPACE:
 994       case OP_NOT_WORDCHAR:
 995       if (clen > 0 && (c >= 256 ||
 996             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 997         { ADD_NEW(state_offset + 1, 0); }
 998       break;
 999
1000       /*-----------------------------------------------------------------*/
1001       case OP_WORD_BOUNDARY:
1002       case OP_NOT_WORD_BOUNDARY:
1003         {
1004         int left_word, right_word;
1005
1006         if (ptr > start_subject)
1007           {
1008           const pcre_uchar *temp = ptr - 1;
1009           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010 #ifdef SUPPORT_UTF
1011           if (utf) { BACKCHAR(temp); }
1012 #endif
1013           GETCHARTEST(d, temp);
1014 #ifdef SUPPORT_UCP
1015           if ((md->poptions & PCRE_UCP) != 0)
1016             {
1017             if (d == '_') left_word = TRUE; else
1018               {
1019               int cat = UCD_CATEGORY(d);
1020               left_word = (cat == ucp_L || cat == ucp_N);
1021               }
1022             }
1023           else
1024 #endif
1025           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026           }
1027         else left_word = FALSE;
1028
1029         if (clen > 0)
1030           {
1031 #ifdef SUPPORT_UCP
1032           if ((md->poptions & PCRE_UCP) != 0)
1033             {
1034             if (c == '_') right_word = TRUE; else
1035               {
1036               int cat = UCD_CATEGORY(c);
1037               right_word = (cat == ucp_L || cat == ucp_N);
1038               }
1039             }
1040           else
1041 #endif
1042           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043           }
1044         else right_word = FALSE;
1045
1046         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047           { ADD_ACTIVE(state_offset + 1, 0); }
1048         }
1049       break;
1050
1051
1052       /*-----------------------------------------------------------------*/
1053       /* Check the next character by Unicode property. We will get here only
1054       if the support is in the binary; otherwise a compile-time error occurs.
1055       */
1056
1057 #ifdef SUPPORT_UCP
1058       case OP_PROP:
1059       case OP_NOTPROP:
1060       if (clen > 0)
1061         {
1062         BOOL OK;
1063         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1064         switch(code[1])
1065           {
1066           case PT_ANY:
1067           OK = TRUE;
1068           break;
1069
1070           case PT_LAMP:
1071           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1072                chartype == ucp_Lt;
1073           break;
1074
1075           case PT_GC:
1076           OK = PRIV(ucp_gentype)[chartype] == code[2];
1077           break;
1078
1079           case PT_PC:
1080           OK = chartype == code[2];
1081           break;
1082
1083           case PT_SC:
1084           OK = UCD_SCRIPT(c) == code[2];
1085           break;
1086
1087           /* These are specials for combination cases. */
1088
1089           case PT_ALNUM:
1090           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1091                PRIV(ucp_gentype)[chartype] == ucp_N;
1092           break;
1093
1094           case PT_SPACE:    /* Perl space */
1095           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1096                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097           break;
1098
1099           case PT_PXSPACE:  /* POSIX space */
1100           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1101                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102                c == CHAR_FF || c == CHAR_CR;
1103           break;
1104
1105           case PT_WORD:
1106           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1107                PRIV(ucp_gentype)[chartype] == ucp_N ||
1108                c == CHAR_UNDERSCORE;
1109           break;
1110
1111           /* Should never occur, but keep compilers from grumbling. */
1112
1113           default:
1114           OK = codevalue != OP_PROP;
1115           break;
1116           }
1117
1118         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1119         }
1120       break;
1121 #endif
1122
1123
1124
1125 /* ========================================================================== */
1126       /* These opcodes likewise inspect the subject character, but have an
1127       argument that is not a data character. It is one of these opcodes:
1128       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1129       OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130
1131       case OP_TYPEPLUS:
1132       case OP_TYPEMINPLUS:
1133       case OP_TYPEPOSPLUS:
1134       count = current_state->count;  /* Already matched */
1135       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136       if (clen > 0)
1137         {
1138         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140             NLBLOCK->nltype == NLTYPE_FIXED &&
1141             NLBLOCK->nllen == 2 &&
1142             c == NLBLOCK->nl[0])
1143           {
1144           could_continue = partial_newline = TRUE;
1145           }
1146         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147             (c < 256 &&
1148               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1149               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150           {
1151           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1152             {
1153             active_count--;            /* Remove non-match possibility */
1154             next_active_state--;
1155             }
1156           count++;
1157           ADD_NEW(state_offset, count);
1158           }
1159         }
1160       break;
1161
1162       /*-----------------------------------------------------------------*/
1163       case OP_TYPEQUERY:
1164       case OP_TYPEMINQUERY:
1165       case OP_TYPEPOSQUERY:
1166       ADD_ACTIVE(state_offset + 2, 0);
1167       if (clen > 0)
1168         {
1169         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171             NLBLOCK->nltype == NLTYPE_FIXED &&
1172             NLBLOCK->nllen == 2 &&
1173             c == NLBLOCK->nl[0])
1174           {
1175           could_continue = partial_newline = TRUE;
1176           }
1177         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178             (c < 256 &&
1179               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1180               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181           {
1182           if (codevalue == OP_TYPEPOSQUERY)
1183             {
1184             active_count--;            /* Remove non-match possibility */
1185             next_active_state--;
1186             }
1187           ADD_NEW(state_offset + 2, 0);
1188           }
1189         }
1190       break;
1191
1192       /*-----------------------------------------------------------------*/
1193       case OP_TYPESTAR:
1194       case OP_TYPEMINSTAR:
1195       case OP_TYPEPOSSTAR:
1196       ADD_ACTIVE(state_offset + 2, 0);
1197       if (clen > 0)
1198         {
1199         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201             NLBLOCK->nltype == NLTYPE_FIXED &&
1202             NLBLOCK->nllen == 2 &&
1203             c == NLBLOCK->nl[0])
1204           {
1205           could_continue = partial_newline = TRUE;
1206           }
1207         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208             (c < 256 &&
1209               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211           {
1212           if (codevalue == OP_TYPEPOSSTAR)
1213             {
1214             active_count--;            /* Remove non-match possibility */
1215             next_active_state--;
1216             }
1217           ADD_NEW(state_offset, 0);
1218           }
1219         }
1220       break;
1221
1222       /*-----------------------------------------------------------------*/
1223       case OP_TYPEEXACT:
1224       count = current_state->count;  /* Number already matched */
1225       if (clen > 0)
1226         {
1227         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229             NLBLOCK->nltype == NLTYPE_FIXED &&
1230             NLBLOCK->nllen == 2 &&
1231             c == NLBLOCK->nl[0])
1232           {
1233           could_continue = partial_newline = TRUE;
1234           }
1235         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236             (c < 256 &&
1237               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1238               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239           {
1240           if (++count >= GET2(code, 1))
1241             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242           else
1243             { ADD_NEW(state_offset, count); }
1244           }
1245         }
1246       break;
1247
1248       /*-----------------------------------------------------------------*/
1249       case OP_TYPEUPTO:
1250       case OP_TYPEMINUPTO:
1251       case OP_TYPEPOSUPTO:
1252       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253       count = current_state->count;  /* Number already matched */
1254       if (clen > 0)
1255         {
1256         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258             NLBLOCK->nltype == NLTYPE_FIXED &&
1259             NLBLOCK->nllen == 2 &&
1260             c == NLBLOCK->nl[0])
1261           {
1262           could_continue = partial_newline = TRUE;
1263           }
1264         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265             (c < 256 &&
1266               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268           {
1269           if (codevalue == OP_TYPEPOSUPTO)
1270             {
1271             active_count--;           /* Remove non-match possibility */
1272             next_active_state--;
1273             }
1274           if (++count >= GET2(code, 1))
1275             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276           else
1277             { ADD_NEW(state_offset, count); }
1278           }
1279         }
1280       break;
1281
1282 /* ========================================================================== */
1283       /* These are virtual opcodes that are used when something like
1284       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1285       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1286       code above fast for the other cases. The argument is in the d variable. */
1287
1288 #ifdef SUPPORT_UCP
1289       case OP_PROP_EXTRA + OP_TYPEPLUS:
1290       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1292       count = current_state->count;           /* Already matched */
1293       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1294       if (clen > 0)
1295         {
1296         BOOL OK;
1297         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1298         switch(code[2])
1299           {
1300           case PT_ANY:
1301           OK = TRUE;
1302           break;
1303
1304           case PT_LAMP:
1305           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1306             chartype == ucp_Lt;
1307           break;
1308
1309           case PT_GC:
1310           OK = PRIV(ucp_gentype)[chartype] == code[3];
1311           break;
1312
1313           case PT_PC:
1314           OK = chartype == code[3];
1315           break;
1316
1317           case PT_SC:
1318           OK = UCD_SCRIPT(c) == code[3];
1319           break;
1320
1321           /* These are specials for combination cases. */
1322
1323           case PT_ALNUM:
1324           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1325                PRIV(ucp_gentype)[chartype] == ucp_N;
1326           break;
1327
1328           case PT_SPACE:    /* Perl space */
1329           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1330                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331           break;
1332
1333           case PT_PXSPACE:  /* POSIX space */
1334           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1335                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336                c == CHAR_FF || c == CHAR_CR;
1337           break;
1338
1339           case PT_WORD:
1340           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1341                PRIV(ucp_gentype)[chartype] == ucp_N ||
1342                c == CHAR_UNDERSCORE;
1343           break;
1344
1345           /* Should never occur, but keep compilers from grumbling. */
1346
1347           default:
1348           OK = codevalue != OP_PROP;
1349           break;
1350           }
1351
1352         if (OK == (d == OP_PROP))
1353           {
1354           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1355             {
1356             active_count--;           /* Remove non-match possibility */
1357             next_active_state--;
1358             }
1359           count++;
1360           ADD_NEW(state_offset, count);
1361           }
1362         }
1363       break;
1364
1365       /*-----------------------------------------------------------------*/
1366       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1367       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1368       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369       count = current_state->count;  /* Already matched */
1370       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372         {
1373         const pcre_uchar *nptr = ptr + clen;
1374         int ncount = 0;
1375         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376           {
1377           active_count--;           /* Remove non-match possibility */
1378           next_active_state--;
1379           }
1380         while (nptr < end_subject)
1381           {
1382           int nd;
1383           int ndlen = 1;
1384           GETCHARLEN(nd, nptr, ndlen);
1385           if (UCD_CATEGORY(nd) != ucp_M) break;
1386           ncount++;
1387           nptr += ndlen;
1388           }
1389         count++;
1390         ADD_NEW_DATA(-state_offset, count, ncount);
1391         }
1392       break;
1393 #endif
1394
1395       /*-----------------------------------------------------------------*/
1396       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1397       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1398       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1399       count = current_state->count;  /* Already matched */
1400       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1401       if (clen > 0)
1402         {
1403         int ncount = 0;
1404         switch (c)
1405           {
1406           case 0x000b:
1407           case 0x000c:
1408           case 0x0085:
1409           case 0x2028:
1410           case 0x2029:
1411           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412           goto ANYNL01;
1413
1414           case 0x000d:
1415           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416           /* Fall through */
1417
1418           ANYNL01:
1419           case 0x000a:
1420           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421             {
1422             active_count--;           /* Remove non-match possibility */
1423             next_active_state--;
1424             }
1425           count++;
1426           ADD_NEW_DATA(-state_offset, count, ncount);
1427           break;
1428
1429           default:
1430           break;
1431           }
1432         }
1433       break;
1434
1435       /*-----------------------------------------------------------------*/
1436       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1437       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1438       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1439       count = current_state->count;  /* Already matched */
1440       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441       if (clen > 0)
1442         {
1443         BOOL OK;
1444         switch (c)
1445           {
1446           case 0x000a:
1447           case 0x000b:
1448           case 0x000c:
1449           case 0x000d:
1450           case 0x0085:
1451           case 0x2028:
1452           case 0x2029:
1453           OK = TRUE;
1454           break;
1455
1456           default:
1457           OK = FALSE;
1458           break;
1459           }
1460
1461         if (OK == (d == OP_VSPACE))
1462           {
1463           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464             {
1465             active_count--;           /* Remove non-match possibility */
1466             next_active_state--;
1467             }
1468           count++;
1469           ADD_NEW_DATA(-state_offset, count, 0);
1470           }
1471         }
1472       break;
1473
1474       /*-----------------------------------------------------------------*/
1475       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478       count = current_state->count;  /* Already matched */
1479       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480       if (clen > 0)
1481         {
1482         BOOL OK;
1483         switch (c)
1484           {
1485           case 0x09:      /* HT */
1486           case 0x20:      /* SPACE */
1487           case 0xa0:      /* NBSP */
1488           case 0x1680:    /* OGHAM SPACE MARK */
1489           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1490           case 0x2000:    /* EN QUAD */
1491           case 0x2001:    /* EM QUAD */
1492           case 0x2002:    /* EN SPACE */
1493           case 0x2003:    /* EM SPACE */
1494           case 0x2004:    /* THREE-PER-EM SPACE */
1495           case 0x2005:    /* FOUR-PER-EM SPACE */
1496           case 0x2006:    /* SIX-PER-EM SPACE */
1497           case 0x2007:    /* FIGURE SPACE */
1498           case 0x2008:    /* PUNCTUATION SPACE */
1499           case 0x2009:    /* THIN SPACE */
1500           case 0x200A:    /* HAIR SPACE */
1501           case 0x202f:    /* NARROW NO-BREAK SPACE */
1502           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1503           case 0x3000:    /* IDEOGRAPHIC SPACE */
1504           OK = TRUE;
1505           break;
1506
1507           default:
1508           OK = FALSE;
1509           break;
1510           }
1511
1512         if (OK == (d == OP_HSPACE))
1513           {
1514           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515             {
1516             active_count--;           /* Remove non-match possibility */
1517             next_active_state--;
1518             }
1519           count++;
1520           ADD_NEW_DATA(-state_offset, count, 0);
1521           }
1522         }
1523       break;
1524
1525       /*-----------------------------------------------------------------*/
1526 #ifdef SUPPORT_UCP
1527       case OP_PROP_EXTRA + OP_TYPEQUERY:
1528       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1529       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1530       count = 4;
1531       goto QS1;
1532
1533       case OP_PROP_EXTRA + OP_TYPESTAR:
1534       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1535       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1536       count = 0;
1537
1538       QS1:
1539
1540       ADD_ACTIVE(state_offset + 4, 0);
1541       if (clen > 0)
1542         {
1543         BOOL OK;
1544         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1545         switch(code[2])
1546           {
1547           case PT_ANY:
1548           OK = TRUE;
1549           break;
1550
1551           case PT_LAMP:
1552           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1553             chartype == ucp_Lt;
1554           break;
1555
1556           case PT_GC:
1557           OK = PRIV(ucp_gentype)[chartype] == code[3];
1558           break;
1559
1560           case PT_PC:
1561           OK = chartype == code[3];
1562           break;
1563
1564           case PT_SC:
1565           OK = UCD_SCRIPT(c) == code[3];
1566           break;
1567
1568           /* These are specials for combination cases. */
1569
1570           case PT_ALNUM:
1571           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1572                PRIV(ucp_gentype)[chartype] == ucp_N;
1573           break;
1574
1575           case PT_SPACE:    /* Perl space */
1576           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1577                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578           break;
1579
1580           case PT_PXSPACE:  /* POSIX space */
1581           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1582                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583                c == CHAR_FF || c == CHAR_CR;
1584           break;
1585
1586           case PT_WORD:
1587           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1588                PRIV(ucp_gentype)[chartype] == ucp_N ||
1589                c == CHAR_UNDERSCORE;
1590           break;
1591
1592           /* Should never occur, but keep compilers from grumbling. */
1593
1594           default:
1595           OK = codevalue != OP_PROP;
1596           break;
1597           }
1598
1599         if (OK == (d == OP_PROP))
1600           {
1601           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1602               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1603             {
1604             active_count--;           /* Remove non-match possibility */
1605             next_active_state--;
1606             }
1607           ADD_NEW(state_offset + count, 0);
1608           }
1609         }
1610       break;
1611
1612       /*-----------------------------------------------------------------*/
1613       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1614       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1615       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1616       count = 2;
1617       goto QS2;
1618
1619       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1620       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1621       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1622       count = 0;
1623
1624       QS2:
1625
1626       ADD_ACTIVE(state_offset + 2, 0);
1627       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628         {
1629         const pcre_uchar *nptr = ptr + clen;
1630         int ncount = 0;
1631         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1633           {
1634           active_count--;           /* Remove non-match possibility */
1635           next_active_state--;
1636           }
1637         while (nptr < end_subject)
1638           {
1639           int nd;
1640           int ndlen = 1;
1641           GETCHARLEN(nd, nptr, ndlen);
1642           if (UCD_CATEGORY(nd) != ucp_M) break;
1643           ncount++;
1644           nptr += ndlen;
1645           }
1646         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1647         }
1648       break;
1649 #endif
1650
1651       /*-----------------------------------------------------------------*/
1652       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1653       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1654       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1655       count = 2;
1656       goto QS3;
1657
1658       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1659       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1660       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1661       count = 0;
1662
1663       QS3:
1664       ADD_ACTIVE(state_offset + 2, 0);
1665       if (clen > 0)
1666         {
1667         int ncount = 0;
1668         switch (c)
1669           {
1670           case 0x000b:
1671           case 0x000c:
1672           case 0x0085:
1673           case 0x2028:
1674           case 0x2029:
1675           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1676           goto ANYNL02;
1677
1678           case 0x000d:
1679           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1680           /* Fall through */
1681
1682           ANYNL02:
1683           case 0x000a:
1684           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1685               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1686             {
1687             active_count--;           /* Remove non-match possibility */
1688             next_active_state--;
1689             }
1690           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1691           break;
1692
1693           default:
1694           break;
1695           }
1696         }
1697       break;
1698
1699       /*-----------------------------------------------------------------*/
1700       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1701       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1702       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1703       count = 2;
1704       goto QS4;
1705
1706       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1707       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1708       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1709       count = 0;
1710
1711       QS4:
1712       ADD_ACTIVE(state_offset + 2, 0);
1713       if (clen > 0)
1714         {
1715         BOOL OK;
1716         switch (c)
1717           {
1718           case 0x000a:
1719           case 0x000b:
1720           case 0x000c:
1721           case 0x000d:
1722           case 0x0085:
1723           case 0x2028:
1724           case 0x2029:
1725           OK = TRUE;
1726           break;
1727
1728           default:
1729           OK = FALSE;
1730           break;
1731           }
1732         if (OK == (d == OP_VSPACE))
1733           {
1734           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736             {
1737             active_count--;           /* Remove non-match possibility */
1738             next_active_state--;
1739             }
1740           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741           }
1742         }
1743       break;
1744
1745       /*-----------------------------------------------------------------*/
1746       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749       count = 2;
1750       goto QS5;
1751
1752       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755       count = 0;
1756
1757       QS5:
1758       ADD_ACTIVE(state_offset + 2, 0);
1759       if (clen > 0)
1760         {
1761         BOOL OK;
1762         switch (c)
1763           {
1764           case 0x09:      /* HT */
1765           case 0x20:      /* SPACE */
1766           case 0xa0:      /* NBSP */
1767           case 0x1680:    /* OGHAM SPACE MARK */
1768           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1769           case 0x2000:    /* EN QUAD */
1770           case 0x2001:    /* EM QUAD */
1771           case 0x2002:    /* EN SPACE */
1772           case 0x2003:    /* EM SPACE */
1773           case 0x2004:    /* THREE-PER-EM SPACE */
1774           case 0x2005:    /* FOUR-PER-EM SPACE */
1775           case 0x2006:    /* SIX-PER-EM SPACE */
1776           case 0x2007:    /* FIGURE SPACE */
1777           case 0x2008:    /* PUNCTUATION SPACE */
1778           case 0x2009:    /* THIN SPACE */
1779           case 0x200A:    /* HAIR SPACE */
1780           case 0x202f:    /* NARROW NO-BREAK SPACE */
1781           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1782           case 0x3000:    /* IDEOGRAPHIC SPACE */
1783           OK = TRUE;
1784           break;
1785
1786           default:
1787           OK = FALSE;
1788           break;
1789           }
1790
1791         if (OK == (d == OP_HSPACE))
1792           {
1793           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795             {
1796             active_count--;           /* Remove non-match possibility */
1797             next_active_state--;
1798             }
1799           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800           }
1801         }
1802       break;
1803
1804       /*-----------------------------------------------------------------*/
1805 #ifdef SUPPORT_UCP
1806       case OP_PROP_EXTRA + OP_TYPEEXACT:
1807       case OP_PROP_EXTRA + OP_TYPEUPTO:
1808       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1809       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1810       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1811         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1812       count = current_state->count;  /* Number already matched */
1813       if (clen > 0)
1814         {
1815         BOOL OK;
1816         const pcre_uint8 chartype = UCD_CHARTYPE(c);
1817         switch(code[1 + IMM2_SIZE + 1])
1818           {
1819           case PT_ANY:
1820           OK = TRUE;
1821           break;
1822
1823           case PT_LAMP:
1824           OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1825             chartype == ucp_Lt;
1826           break;
1827
1828           case PT_GC:
1829           OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
1830           break;
1831
1832           case PT_PC:
1833           OK = chartype == code[1 + IMM2_SIZE + 2];
1834           break;
1835
1836           case PT_SC:
1837           OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
1838           break;
1839
1840           /* These are specials for combination cases. */
1841
1842           case PT_ALNUM:
1843           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1844                PRIV(ucp_gentype)[chartype] == ucp_N;
1845           break;
1846
1847           case PT_SPACE:    /* Perl space */
1848           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1849                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1850           break;
1851
1852           case PT_PXSPACE:  /* POSIX space */
1853           OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1854                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1855                c == CHAR_FF || c == CHAR_CR;
1856           break;
1857
1858           case PT_WORD:
1859           OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1860                PRIV(ucp_gentype)[chartype] == ucp_N ||
1861                c == CHAR_UNDERSCORE;
1862           break;
1863
1864           /* Should never occur, but keep compilers from grumbling. */
1865
1866           default:
1867           OK = codevalue != OP_PROP;
1868           break;
1869           }
1870
1871         if (OK == (d == OP_PROP))
1872           {
1873           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1874             {
1875             active_count--;           /* Remove non-match possibility */
1876             next_active_state--;
1877             }
1878           if (++count >= GET2(code, 1))
1879             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1880           else
1881             { ADD_NEW(state_offset, count); }
1882           }
1883         }
1884       break;
1885
1886       /*-----------------------------------------------------------------*/
1887       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1888       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1889       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1892         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1893       count = current_state->count;  /* Number already matched */
1894       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895         {
1896         const pcre_uchar *nptr = ptr + clen;
1897         int ncount = 0;
1898         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899           {
1900           active_count--;           /* Remove non-match possibility */
1901           next_active_state--;
1902           }
1903         while (nptr < end_subject)
1904           {
1905           int nd;
1906           int ndlen = 1;
1907           GETCHARLEN(nd, nptr, ndlen);
1908           if (UCD_CATEGORY(nd) != ucp_M) break;
1909           ncount++;
1910           nptr += ndlen;
1911           }
1912         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1913             reset_could_continue = TRUE;
1914         if (++count >= GET2(code, 1))
1915           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1916         else
1917           { ADD_NEW_DATA(-state_offset, count, ncount); }
1918         }
1919       break;
1920 #endif
1921
1922       /*-----------------------------------------------------------------*/
1923       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1924       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1925       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1926       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1927       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1928         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1929       count = current_state->count;  /* Number already matched */
1930       if (clen > 0)
1931         {
1932         int ncount = 0;
1933         switch (c)
1934           {
1935           case 0x000b:
1936           case 0x000c:
1937           case 0x0085:
1938           case 0x2028:
1939           case 0x2029:
1940           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1941           goto ANYNL03;
1942
1943           case 0x000d:
1944           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1945           /* Fall through */
1946
1947           ANYNL03:
1948           case 0x000a:
1949           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1950             {
1951             active_count--;           /* Remove non-match possibility */
1952             next_active_state--;
1953             }
1954           if (++count >= GET2(code, 1))
1955             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1956           else
1957             { ADD_NEW_DATA(-state_offset, count, ncount); }
1958           break;
1959
1960           default:
1961           break;
1962           }
1963         }
1964       break;
1965
1966       /*-----------------------------------------------------------------*/
1967       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1968       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1969       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1972         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1973       count = current_state->count;  /* Number already matched */
1974       if (clen > 0)
1975         {
1976         BOOL OK;
1977         switch (c)
1978           {
1979           case 0x000a:
1980           case 0x000b:
1981           case 0x000c:
1982           case 0x000d:
1983           case 0x0085:
1984           case 0x2028:
1985           case 0x2029:
1986           OK = TRUE;
1987           break;
1988
1989           default:
1990           OK = FALSE;
1991           }
1992
1993         if (OK == (d == OP_VSPACE))
1994           {
1995           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996             {
1997             active_count--;           /* Remove non-match possibility */
1998             next_active_state--;
1999             }
2000           if (++count >= GET2(code, 1))
2001             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2002           else
2003             { ADD_NEW_DATA(-state_offset, count, 0); }
2004           }
2005         }
2006       break;
2007
2008       /*-----------------------------------------------------------------*/
2009       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2010       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2011       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2014         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2015       count = current_state->count;  /* Number already matched */
2016       if (clen > 0)
2017         {
2018         BOOL OK;
2019         switch (c)
2020           {
2021           case 0x09:      /* HT */
2022           case 0x20:      /* SPACE */
2023           case 0xa0:      /* NBSP */
2024           case 0x1680:    /* OGHAM SPACE MARK */
2025           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2026           case 0x2000:    /* EN QUAD */
2027           case 0x2001:    /* EM QUAD */
2028           case 0x2002:    /* EN SPACE */
2029           case 0x2003:    /* EM SPACE */
2030           case 0x2004:    /* THREE-PER-EM SPACE */
2031           case 0x2005:    /* FOUR-PER-EM SPACE */
2032           case 0x2006:    /* SIX-PER-EM SPACE */
2033           case 0x2007:    /* FIGURE SPACE */
2034           case 0x2008:    /* PUNCTUATION SPACE */
2035           case 0x2009:    /* THIN SPACE */
2036           case 0x200A:    /* HAIR SPACE */
2037           case 0x202f:    /* NARROW NO-BREAK SPACE */
2038           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2039           case 0x3000:    /* IDEOGRAPHIC SPACE */
2040           OK = TRUE;
2041           break;
2042
2043           default:
2044           OK = FALSE;
2045           break;
2046           }
2047
2048         if (OK == (d == OP_HSPACE))
2049           {
2050           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051             {
2052             active_count--;           /* Remove non-match possibility */
2053             next_active_state--;
2054             }
2055           if (++count >= GET2(code, 1))
2056             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2057           else
2058             { ADD_NEW_DATA(-state_offset, count, 0); }
2059           }
2060         }
2061       break;
2062
2063 /* ========================================================================== */
2064       /* These opcodes are followed by a character that is usually compared
2065       to the current subject character; it is loaded into d. We still get
2066       here even if there is no subject character, because in some cases zero
2067       repetitions are permitted. */
2068
2069       /*-----------------------------------------------------------------*/
2070       case OP_CHAR:
2071       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2072       break;
2073
2074       /*-----------------------------------------------------------------*/
2075       case OP_CHARI:
2076       if (clen == 0) break;
2077
2078 #ifdef SUPPORT_UTF
2079       if (utf)
2080         {
2081         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2082           {
2083           unsigned int othercase;
2084           if (c < 128)
2085             othercase = fcc[c];
2086           else
2087             /* If we have Unicode property support, we can use it to test the
2088             other case of the character. */
2089 #ifdef SUPPORT_UCP
2090             othercase = UCD_OTHERCASE(c);
2091 #else
2092             othercase = NOTACHAR;
2093 #endif
2094
2095           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096           }
2097         }
2098       else
2099 #endif  /* SUPPORT_UTF */
2100       /* Not UTF mode */
2101         {
2102         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2103           { ADD_NEW(state_offset + 2, 0); }
2104         }
2105       break;
2106
2107
2108 #ifdef SUPPORT_UCP
2109       /*-----------------------------------------------------------------*/
2110       /* This is a tricky one because it can match more than one character.
2111       Find out how many characters to skip, and then set up a negative state
2112       to wait for them to pass before continuing. */
2113
2114       case OP_EXTUNI:
2115       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116         {
2117         const pcre_uchar *nptr = ptr + clen;
2118         int ncount = 0;
2119         while (nptr < end_subject)
2120           {
2121           int nclen = 1;
2122           GETCHARLEN(c, nptr, nclen);
2123           if (UCD_CATEGORY(c) != ucp_M) break;
2124           ncount++;
2125           nptr += nclen;
2126           }
2127         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2128             reset_could_continue = TRUE;
2129         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130         }
2131       break;
2132 #endif
2133
2134       /*-----------------------------------------------------------------*/
2135       /* This is tricky, like EXTUNI, because it too can match more than one
2136       character (when CR is followed by LF). In that case, set up a negative
2137       state to wait for one character to pass before continuing. */
2138
2139       case OP_ANYNL:
2140       if (clen > 0) switch(c)
2141         {
2142         case 0x000b:
2143         case 0x000c:
2144         case 0x0085:
2145         case 0x2028:
2146         case 0x2029:
2147         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148         /* Fall through */
2149         case 0x000a:
2150         ADD_NEW(state_offset + 1, 0);
2151         break;
2152
2153         case 0x000d:
2154         if (ptr + 1 >= end_subject)    /* Nothing follows the CR */
2155           {
2156           ADD_NEW(state_offset + 1, 0);
2157           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2158             reset_could_continue = TRUE;
2159           }
2160         else if (ptr[1] == 0x0a)       /* CR is followed by LF */
2161           {
2162           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163           }
2164         else                           /* CR on its own */
2165           {
2166           ADD_NEW(state_offset + 1, 0);
2167           }
2168         break;
2169         }
2170       break;
2171
2172       /*-----------------------------------------------------------------*/
2173       case OP_NOT_VSPACE:
2174       if (clen > 0) switch(c)
2175         {
2176         case 0x000a:
2177         case 0x000b:
2178         case 0x000c:
2179         case 0x000d:
2180         case 0x0085:
2181         case 0x2028:
2182         case 0x2029:
2183         break;
2184
2185         default:
2186         ADD_NEW(state_offset + 1, 0);
2187         break;
2188         }
2189       break;
2190
2191       /*-----------------------------------------------------------------*/
2192       case OP_VSPACE:
2193       if (clen > 0) switch(c)
2194         {
2195         case 0x000a:
2196         case 0x000b:
2197         case 0x000c:
2198         case 0x000d:
2199         case 0x0085:
2200         case 0x2028:
2201         case 0x2029:
2202         ADD_NEW(state_offset + 1, 0);
2203         break;
2204
2205         default: break;
2206         }
2207       break;
2208
2209       /*-----------------------------------------------------------------*/
2210       case OP_NOT_HSPACE:
2211       if (clen > 0) switch(c)
2212         {
2213         case 0x09:      /* HT */
2214         case 0x20:      /* SPACE */
2215         case 0xa0:      /* NBSP */
2216         case 0x1680:    /* OGHAM SPACE MARK */
2217         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2218         case 0x2000:    /* EN QUAD */
2219         case 0x2001:    /* EM QUAD */
2220         case 0x2002:    /* EN SPACE */
2221         case 0x2003:    /* EM SPACE */
2222         case 0x2004:    /* THREE-PER-EM SPACE */
2223         case 0x2005:    /* FOUR-PER-EM SPACE */
2224         case 0x2006:    /* SIX-PER-EM SPACE */
2225         case 0x2007:    /* FIGURE SPACE */
2226         case 0x2008:    /* PUNCTUATION SPACE */
2227         case 0x2009:    /* THIN SPACE */
2228         case 0x200A:    /* HAIR SPACE */
2229         case 0x202f:    /* NARROW NO-BREAK SPACE */
2230         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2231         case 0x3000:    /* IDEOGRAPHIC SPACE */
2232         break;
2233
2234         default:
2235         ADD_NEW(state_offset + 1, 0);
2236         break;
2237         }
2238       break;
2239
2240       /*-----------------------------------------------------------------*/
2241       case OP_HSPACE:
2242       if (clen > 0) switch(c)
2243         {
2244         case 0x09:      /* HT */
2245         case 0x20:      /* SPACE */
2246         case 0xa0:      /* NBSP */
2247         case 0x1680:    /* OGHAM SPACE MARK */
2248         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2249         case 0x2000:    /* EN QUAD */
2250         case 0x2001:    /* EM QUAD */
2251         case 0x2002:    /* EN SPACE */
2252         case 0x2003:    /* EM SPACE */
2253         case 0x2004:    /* THREE-PER-EM SPACE */
2254         case 0x2005:    /* FOUR-PER-EM SPACE */
2255         case 0x2006:    /* SIX-PER-EM SPACE */
2256         case 0x2007:    /* FIGURE SPACE */
2257         case 0x2008:    /* PUNCTUATION SPACE */
2258         case 0x2009:    /* THIN SPACE */
2259         case 0x200A:    /* HAIR SPACE */
2260         case 0x202f:    /* NARROW NO-BREAK SPACE */
2261         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2262         case 0x3000:    /* IDEOGRAPHIC SPACE */
2263         ADD_NEW(state_offset + 1, 0);
2264         break;
2265         }
2266       break;
2267
2268       /*-----------------------------------------------------------------*/
2269       /* Match a negated single character casefully. */
2270
2271       case OP_NOT:
2272       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2273       break;
2274
2275       /*-----------------------------------------------------------------*/
2276       /* Match a negated single character caselessly: a new state is added
2277       only when c differs from both d and otherd (d's other case, if known). */
2278       case OP_NOTI:
2279       if (clen > 0)
2280         {
2281         unsigned int otherd = NOTACHAR;  /* Kept if UTF without UCP, d >= 128 */
2282 #ifdef SUPPORT_UTF
2283         if (utf && d >= 128)
2284           {
2285 #ifdef SUPPORT_UCP
2286           otherd = UCD_OTHERCASE(d);
2287 #endif  /* SUPPORT_UCP */
2288           }
2289         else
2290 #endif  /* SUPPORT_UTF */
2291         otherd = TABLE_GET(d, fcc, d);
2292         if (c != d && c != otherd)
2293           { ADD_NEW(state_offset + dlen + 1, 0); }
2294         }
2295       break;
2296
2297       /*-----------------------------------------------------------------*/
2298       case OP_PLUSI:
2299       case OP_MINPLUSI:
2300       case OP_POSPLUSI:
2301       case OP_NOTPLUSI:
2302       case OP_NOTMINPLUSI:
2303       case OP_NOTPOSPLUSI:
2304       caseless = TRUE;
2305       codevalue -= OP_STARI - OP_STAR;
2306
2307       /* Fall through */
2308       case OP_PLUS:
2309       case OP_MINPLUS:
2310       case OP_POSPLUS:
2311       case OP_NOTPLUS:
2312       case OP_NOTMINPLUS:
2313       case OP_NOTPOSPLUS:
2314       count = current_state->count;  /* Already matched */
2315       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2316       if (clen > 0)
2317         {
2318         unsigned int otherd = NOTACHAR;
2319         if (caseless)
2320           {
2321 #ifdef SUPPORT_UTF
2322           if (utf && d >= 128)
2323             {
2324 #ifdef SUPPORT_UCP
2325             otherd = UCD_OTHERCASE(d);
2326 #endif  /* SUPPORT_UCP */
2327             }
2328           else
2329 #endif  /* SUPPORT_UTF */
2330           otherd = TABLE_GET(d, fcc, d);
2331           }
2332         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2333           {
2334           if (count > 0 &&
2335               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2336             {
2337             active_count--;             /* Remove non-match possibility */
2338             next_active_state--;
2339             }
2340           count++;
2341           ADD_NEW(state_offset, count);
2342           }
2343         }
2344       break;
2345
2346       /*-----------------------------------------------------------------*/
2347       case OP_QUERYI:
2348       case OP_MINQUERYI:
2349       case OP_POSQUERYI:
2350       case OP_NOTQUERYI:
2351       case OP_NOTMINQUERYI:
2352       case OP_NOTPOSQUERYI:
2353       caseless = TRUE;
2354       codevalue -= OP_STARI - OP_STAR;
2355       /* Fall through */
2356       case OP_QUERY:
2357       case OP_MINQUERY:
2358       case OP_POSQUERY:
2359       case OP_NOTQUERY:
2360       case OP_NOTMINQUERY:
2361       case OP_NOTPOSQUERY:
2362       ADD_ACTIVE(state_offset + dlen + 1, 0);
2363       if (clen > 0)
2364         {
2365         unsigned int otherd = NOTACHAR;
2366         if (caseless)
2367           {
2368 #ifdef SUPPORT_UTF
2369           if (utf && d >= 128)
2370             {
2371 #ifdef SUPPORT_UCP
2372             otherd = UCD_OTHERCASE(d);
2373 #endif  /* SUPPORT_UCP */
2374             }
2375           else
2376 #endif  /* SUPPORT_UTF */
2377           otherd = TABLE_GET(d, fcc, d);
2378           }
2379         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2380           {
2381           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2382             {
2383             active_count--;            /* Remove non-match possibility */
2384             next_active_state--;
2385             }
2386           ADD_NEW(state_offset + dlen + 1, 0);
2387           }
2388         }
2389       break;
2390
2391       /*-----------------------------------------------------------------*/
2392       case OP_STARI:
2393       case OP_MINSTARI:
2394       case OP_POSSTARI:
2395       case OP_NOTSTARI:
2396       case OP_NOTMINSTARI:
2397       case OP_NOTPOSSTARI:
2398       caseless = TRUE;
2399       codevalue -= OP_STARI - OP_STAR;
2400       /* Fall through */
2401       case OP_STAR:
2402       case OP_MINSTAR:
2403       case OP_POSSTAR:
2404       case OP_NOTSTAR:
2405       case OP_NOTMINSTAR:
2406       case OP_NOTPOSSTAR:
2407       ADD_ACTIVE(state_offset + dlen + 1, 0);
2408       if (clen > 0)
2409         {
2410         unsigned int otherd = NOTACHAR;
2411         if (caseless)
2412           {
2413 #ifdef SUPPORT_UTF
2414           if (utf && d >= 128)
2415             {
2416 #ifdef SUPPORT_UCP
2417             otherd = UCD_OTHERCASE(d);
2418 #endif  /* SUPPORT_UCP */
2419             }
2420           else
2421 #endif  /* SUPPORT_UTF */
2422           otherd = TABLE_GET(d, fcc, d);
2423           }
2424         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2425           {
2426           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2427             {
2428             active_count--;            /* Remove non-match possibility */
2429             next_active_state--;
2430             }
2431           ADD_NEW(state_offset, 0);
2432           }
2433         }
2434       break;
2435
2436       /*-----------------------------------------------------------------*/
2437       case OP_EXACTI:
2438       case OP_NOTEXACTI:
2439       caseless = TRUE;
2440       codevalue -= OP_STARI - OP_STAR;
2441       /* Fall through */
2442       case OP_EXACT:
2443       case OP_NOTEXACT:
2444       count = current_state->count;  /* Number already matched */
2445       if (clen > 0)
2446         {
2447         unsigned int otherd = NOTACHAR;
2448         if (caseless)
2449           {
2450 #ifdef SUPPORT_UTF
2451           if (utf && d >= 128)
2452             {
2453 #ifdef SUPPORT_UCP
2454             otherd = UCD_OTHERCASE(d);
2455 #endif  /* SUPPORT_UCP */
2456             }
2457           else
2458 #endif  /* SUPPORT_UTF */
2459           otherd = TABLE_GET(d, fcc, d);
2460           }
2461         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462           {
2463           if (++count >= GET2(code, 1))
2464             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2465           else
2466             { ADD_NEW(state_offset, count); }
2467           }
2468         }
2469       break;
2470
2471       /*-----------------------------------------------------------------*/
2472       case OP_UPTOI:
2473       case OP_MINUPTOI:
2474       case OP_POSUPTOI:
2475       case OP_NOTUPTOI:
2476       case OP_NOTMINUPTOI:
2477       case OP_NOTPOSUPTOI:
2478       caseless = TRUE;
2479       codevalue -= OP_STARI - OP_STAR;
2480       /* Fall through */
2481       case OP_UPTO:
2482       case OP_MINUPTO:
2483       case OP_POSUPTO:
2484       case OP_NOTUPTO:
2485       case OP_NOTMINUPTO:
2486       case OP_NOTPOSUPTO:
2487       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2488       count = current_state->count;  /* Number already matched */
2489       if (clen > 0)
2490         {
2491         unsigned int otherd = NOTACHAR;
2492         if (caseless)
2493           {
2494 #ifdef SUPPORT_UTF
2495           if (utf && d >= 128)
2496             {
2497 #ifdef SUPPORT_UCP
2498             otherd = UCD_OTHERCASE(d);
2499 #endif  /* SUPPORT_UCP */
2500             }
2501           else
2502 #endif  /* SUPPORT_UTF */
2503           otherd = TABLE_GET(d, fcc, d);
2504           }
2505         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2506           {
2507           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2508             {
2509             active_count--;             /* Remove non-match possibility */
2510             next_active_state--;
2511             }
2512           if (++count >= GET2(code, 1))
2513             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2514           else
2515             { ADD_NEW(state_offset, count); }
2516           }
2517         }
2518       break;
2519
2520
2521 /* ========================================================================== */
2522       /* These are the class-handling opcodes */
2523
2524       case OP_CLASS:
2525       case OP_NCLASS:
2526       case OP_XCLASS:
2527         {
2528         BOOL isinclass = FALSE;
2529         int next_state_offset;
2530         const pcre_uchar *ecode;
2531
2532         /* For a simple class, there is always just a 32-byte table, and we
2533         can set isinclass from it. */
2534
2535         if (codevalue != OP_XCLASS)
2536           {
2537           ecode = code + 1 + (32 / sizeof(pcre_uchar));
2538           if (clen > 0)
2539             {
2540             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2541               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2542             }
2543           }
2544
2545         /* An extended class may have a table or a list of single characters,
2546         ranges, or both, and it may be positive or negative. There's a
2547         function that sorts all this out. */
2548
2549         else
2550          {
2551          ecode = code + GET(code, 1);
2552          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2553          }
2554
2555         /* At this point, isinclass is set for all kinds of class, and ecode
2556         points to the byte after the end of the class. If there is a
2557         quantifier, this is where it will be. */
2558
2559         next_state_offset = (int)(ecode - start_code);
2560
2561         switch (*ecode)
2562           {
2563           case OP_CRSTAR:
2564           case OP_CRMINSTAR:
2565           ADD_ACTIVE(next_state_offset + 1, 0);
2566           if (isinclass) { ADD_NEW(state_offset, 0); }
2567           break;
2568
2569           case OP_CRPLUS:
2570           case OP_CRMINPLUS:
2571           count = current_state->count;  /* Already matched */
2572           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2573           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2574           break;
2575
2576           case OP_CRQUERY:
2577           case OP_CRMINQUERY:
2578           ADD_ACTIVE(next_state_offset + 1, 0);
2579           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2580           break;
2581
2582           case OP_CRRANGE:
2583           case OP_CRMINRANGE:
2584           count = current_state->count;  /* Already matched */
2585           if (count >= GET2(ecode, 1))
2586             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2587           if (isinclass)
2588             {
2589             int max = GET2(ecode, 1 + IMM2_SIZE);
2590             if (++count >= max && max != 0)   /* Max 0 => no limit */
2591               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2592             else
2593               { ADD_NEW(state_offset, count); }
2594             }
2595           break;
2596
2597           default:
2598           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2599           break;
2600           }
2601         }
2602       break;
2603
2604 /* ========================================================================== */
2605       /* These are the opcodes for fancy brackets of various kinds. We have
2606       to use recursion in order to handle them. The "always failing" assertion
2607       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608       though the other "backtracking verbs" are not supported. */
2609
2610       case OP_FAIL:
2611       forced_fail++;    /* Count FAILs for multiple states */
2612       break;
2613
2614       case OP_ASSERT:
2615       case OP_ASSERT_NOT:
2616       case OP_ASSERTBACK:
2617       case OP_ASSERTBACK_NOT:
2618         {
2619         int rc;
2620         int local_offsets[2];
2621         int local_workspace[1000];
2622         const pcre_uchar *endasscode = code + GET(code, 1);
2623
2624         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2625
2626         rc = internal_dfa_exec(
2627           md,                                   /* static match data */
2628           code,                                 /* this subexpression's code */
2629           ptr,                                  /* where we currently are */
2630           (int)(ptr - start_subject),           /* start offset */
2631           local_offsets,                        /* offset vector */
2632           sizeof(local_offsets)/sizeof(int),    /* size of same */
2633           local_workspace,                      /* workspace vector */
2634           sizeof(local_workspace)/sizeof(int),  /* size of same */
2635           rlevel);                              /* function recursion level */
2636
2637         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2638         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2639             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2640         }
2641       break;
2642
2643       /*-----------------------------------------------------------------*/
2644       case OP_COND:
2645       case OP_SCOND:
2646         {
2647         int local_offsets[1000];
2648         int local_workspace[1000];
2649         int codelink = GET(code, 1);
2650         int condcode;
2651
2652         /* Because of the way auto-callout works during compile, a callout item
2653         is inserted between OP_COND and an assertion condition. This does not
2654         happen for the other conditions. */
2655
2656         if (code[LINK_SIZE+1] == OP_CALLOUT)
2657           {
2658           rrc = 0;
2659           if (PUBL(callout) != NULL)
2660             {
2661             PUBL(callout_block) cb;
2662             cb.version          = 1;   /* Version 1 of the callout block */
2663             cb.callout_number   = code[LINK_SIZE+2];
2664             cb.offset_vector    = offsets;
2665 #ifdef COMPILE_PCRE8
2666             cb.subject          = (PCRE_SPTR)start_subject;
2667 #else
2668             cb.subject          = (PCRE_SPTR16)start_subject;
2669 #endif
2670             cb.subject_length   = (int)(end_subject - start_subject);
2671             cb.start_match      = (int)(current_subject - start_subject);
2672             cb.current_position = (int)(ptr - start_subject);
2673             cb.pattern_position = GET(code, LINK_SIZE + 3);
2674             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2675             cb.capture_top      = 1;
2676             cb.capture_last     = -1;
2677             cb.callout_data     = md->callout_data;
2678             cb.mark             = NULL;   /* No (*MARK) support */
2679             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2680             }
2681           if (rrc > 0) break;                      /* Fail this thread */
2682           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2683           }
2684
2685         condcode = code[LINK_SIZE+1];
2686
2687         /* Back reference conditions are not supported */
2688
2689         if (condcode == OP_CREF || condcode == OP_NCREF)
2690           return PCRE_ERROR_DFA_UCOND;
2691
2692         /* The DEFINE condition is always false */
2693
2694         if (condcode == OP_DEF)
2695           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2696
2697         /* The only supported version of OP_RREF is for the value RREF_ANY,
2698         which means "test if in any recursion". We can't test for specifically
2699         recursed groups. */
2700
2701         else if (condcode == OP_RREF || condcode == OP_NRREF)
2702           {
2703           int value = GET2(code, LINK_SIZE + 2);
2704           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2705           if (md->recursive != NULL)
2706             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2707           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708           }
2709
2710         /* Otherwise, the condition is an assertion */
2711
2712         else
2713           {
2714           int rc;
2715           const pcre_uchar *asscode = code + LINK_SIZE + 1;
2716           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2717
2718           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2719
2720           rc = internal_dfa_exec(
2721             md,                                   /* fixed match data */
2722             asscode,                              /* this subexpression's code */
2723             ptr,                                  /* where we currently are */
2724             (int)(ptr - start_subject),           /* start offset */
2725             local_offsets,                        /* offset vector */
2726             sizeof(local_offsets)/sizeof(int),    /* size of same */
2727             local_workspace,                      /* workspace vector */
2728             sizeof(local_workspace)/sizeof(int),  /* size of same */
2729             rlevel);                              /* function recursion level */
2730
2731           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2732           if ((rc >= 0) ==
2733                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2734             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2735           else
2736             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737           }
2738         }
2739       break;
2740
2741       /*-----------------------------------------------------------------*/
2742       case OP_RECURSE:
2743         {
2744         dfa_recursion_info *ri;
2745         int local_offsets[1000];
2746         int local_workspace[1000];
2747         const pcre_uchar *callpat = start_code + GET(code, 1);
2748         int recno = (callpat == md->start_code)? 0 :
2749           GET2(callpat, 1 + LINK_SIZE);
2750         int rc;
2751
2752         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2753
2754         /* Check for repeating a recursion without advancing the subject
2755         pointer. This should catch convoluted mutual recursions. (Some simple
2756         cases are caught at compile time.) */
2757
2758         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2759           if (recno == ri->group_num && ptr == ri->subject_position)
2760             return PCRE_ERROR_RECURSELOOP;
2761
2762         /* Remember this recursion and where we started it so as to
2763         catch infinite loops. */
2764
2765         new_recursive.group_num = recno;
2766         new_recursive.subject_position = ptr;
2767         new_recursive.prevrec = md->recursive;
2768         md->recursive = &new_recursive;
2769
2770         rc = internal_dfa_exec(
2771           md,                                   /* fixed match data */
2772           callpat,                              /* this subexpression's code */
2773           ptr,                                  /* where we currently are */
2774           (int)(ptr - start_subject),           /* start offset */
2775           local_offsets,                        /* offset vector */
2776           sizeof(local_offsets)/sizeof(int),    /* size of same */
2777           local_workspace,                      /* workspace vector */
2778           sizeof(local_workspace)/sizeof(int),  /* size of same */
2779           rlevel);                              /* function recursion level */
2780
2781         md->recursive = new_recursive.prevrec;  /* Done this recursion */
2782
2783         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2784           rc));
2785
2786         /* Ran out of internal offsets */
2787
2788         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2789
2790         /* For each successful matched substring, set up the next state with a
2791         count of characters to skip before trying it. Note that the count is in
2792         characters, not bytes. */
2793
2794         if (rc > 0)
2795           {
2796           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2797             {
2798             int charcount = local_offsets[rc+1] - local_offsets[rc];
2799 #ifdef SUPPORT_UTF
2800             if (utf)
2801               {
2802               const pcre_uchar *p = start_subject + local_offsets[rc];
2803               const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2804               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2805               }
2806 #endif
2807             if (charcount > 0)
2808               {
2809               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2810               }
2811             else
2812               {
2813               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2814               }
2815             }
2816           }
2817         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2818         }
2819       break;
2820
2821       /*-----------------------------------------------------------------*/
2822       case OP_BRAPOS:
2823       case OP_SBRAPOS:
2824       case OP_CBRAPOS:
2825       case OP_SCBRAPOS:
2826       case OP_BRAPOSZERO:
2827         {
2828         int charcount, matched_count;
2829         const pcre_uchar *local_ptr = ptr;
2830         BOOL allow_zero;
2831
2832         if (codevalue == OP_BRAPOSZERO)
2833           {
2834           allow_zero = TRUE;
2835           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2836           }
2837         else allow_zero = FALSE;
2838
2839         /* Loop to match the subpattern as many times as possible as if it were
2840         a complete pattern. */
2841
2842         for (matched_count = 0;; matched_count++)
2843           {
2844           int local_offsets[2];
2845           int local_workspace[1000];
2846
2847           int rc = internal_dfa_exec(
2848             md,                                   /* fixed match data */
2849             code,                                 /* this subexpression's code */
2850             local_ptr,                            /* where we currently are */
2851             (int)(ptr - start_subject),           /* start offset */
2852             local_offsets,                        /* offset vector */
2853             sizeof(local_offsets)/sizeof(int),    /* size of same */
2854             local_workspace,                      /* workspace vector */
2855             sizeof(local_workspace)/sizeof(int),  /* size of same */
2856             rlevel);                              /* function recursion level */
2857
2858           /* Failed to match */
2859
2860           if (rc < 0)
2861             {
2862             if (rc != PCRE_ERROR_NOMATCH) return rc;
2863             break;
2864             }
2865
2866           /* Matched: break the loop if zero characters matched. */
2867
2868           charcount = local_offsets[1] - local_offsets[0];
2869           if (charcount == 0) break;
2870           local_ptr += charcount;    /* Advance temporary position ptr */
2871           }
2872
2873         /* At this point we have matched the subpattern matched_count
2874         times, and local_ptr is pointing to the character after the end of the
2875         last match. */
2876
2877         if (matched_count > 0 || allow_zero)
2878           {
2879           const pcre_uchar *end_subpattern = code;
2880           int next_state_offset;
2881
2882           do { end_subpattern += GET(end_subpattern, 1); }
2883             while (*end_subpattern == OP_ALT);
2884           next_state_offset =
2885             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2886
2887           /* Optimization: if there are no more active states, and there
2888           are no new states yet set up, then skip over the subject string
2889           right here, to save looping. Otherwise, set up the new state to swing
2890           into action when the end of the matched substring is reached. */
2891
2892           if (i + 1 >= active_count && new_count == 0)
2893             {
2894             ptr = local_ptr;
2895             clen = 0;
2896             ADD_NEW(next_state_offset, 0);
2897             }
2898           else
2899             {
2900             const pcre_uchar *p = ptr;
2901             const pcre_uchar *pp = local_ptr;
2902             charcount = (int)(pp - p);
2903 #ifdef SUPPORT_UTF
2904             if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2905 #endif
2906             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2907             }
2908           }
2909         }
2910       break;
2911
2912       /*-----------------------------------------------------------------*/
2913       case OP_ONCE:
2914       case OP_ONCE_NC:
2915         {
2916         int local_offsets[2];
2917         int local_workspace[1000];
2918
2919         int rc = internal_dfa_exec(
2920           md,                                   /* fixed match data */
2921           code,                                 /* this subexpression's code */
2922           ptr,                                  /* where we currently are */
2923           (int)(ptr - start_subject),           /* start offset */
2924           local_offsets,                        /* offset vector */
2925           sizeof(local_offsets)/sizeof(int),    /* size of same */
2926           local_workspace,                      /* workspace vector */
2927           sizeof(local_workspace)/sizeof(int),  /* size of same */
2928           rlevel);                              /* function recursion level */
2929
2930         if (rc >= 0)
2931           {
2932           const pcre_uchar *end_subpattern = code;
2933           int charcount = local_offsets[1] - local_offsets[0];
2934           int next_state_offset, repeat_state_offset;
2935
2936           do { end_subpattern += GET(end_subpattern, 1); }
2937             while (*end_subpattern == OP_ALT);
2938           next_state_offset =
2939             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2940
2941           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942           arrange for the repeat state also to be added to the relevant list.
2943           Calculate the offset, or set -1 for no repeat. */
2944
2945           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2946                                  *end_subpattern == OP_KETRMIN)?
2947             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2948
2949           /* If we have matched an empty string, add the next state at the
2950           current character pointer. This is important so that the duplicate
2951           checking kicks in, which is what breaks infinite loops that match an
2952           empty string. */
2953
2954           if (charcount == 0)
2955             {
2956             ADD_ACTIVE(next_state_offset, 0);
2957             }
2958
2959           /* Optimization: if there are no more active states, and there
2960           are no new states yet set up, then skip over the subject string
2961           right here, to save looping. Otherwise, set up the new state to swing
2962           into action when the end of the matched substring is reached. */
2963
2964           else if (i + 1 >= active_count && new_count == 0)
2965             {
2966             ptr += charcount;
2967             clen = 0;
2968             ADD_NEW(next_state_offset, 0);
2969
2970             /* If we are adding a repeat state at the new character position,
2971             we must fudge things so that it is the only current state.
2972             Otherwise, it might be a duplicate of one we processed before, and
2973             that would cause it to be skipped. */
2974
2975             if (repeat_state_offset >= 0)
2976               {
2977               next_active_state = active_states;
2978               active_count = 0;
2979               i = -1;
2980               ADD_ACTIVE(repeat_state_offset, 0);
2981               }
2982             }
2983           else
2984             {
2985 #ifdef SUPPORT_UTF
2986             if (utf)
2987               {
2988               const pcre_uchar *p = start_subject + local_offsets[0];
2989               const pcre_uchar *pp = start_subject + local_offsets[1];
2990               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2991               }
2992 #endif
2993             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2994             if (repeat_state_offset >= 0)
2995               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996             }
2997           }
2998         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2999         }
3000       break;
3001
3002
3003 /* ========================================================================== */
3004       /* Handle callouts */
3005
3006       case OP_CALLOUT:
3007       rrc = 0;
3008       if (PUBL(callout) != NULL)
3009         {
3010         PUBL(callout_block) cb;
3011         cb.version          = 1;   /* Version 1 of the callout block */
3012         cb.callout_number   = code[1];
3013         cb.offset_vector    = offsets;
3014 #ifdef COMPILE_PCRE8
3015         cb.subject          = (PCRE_SPTR)start_subject;
3016 #else
3017         cb.subject          = (PCRE_SPTR16)start_subject;
3018 #endif
3019         cb.subject_length   = (int)(end_subject - start_subject);
3020         cb.start_match      = (int)(current_subject - start_subject);
3021         cb.current_position = (int)(ptr - start_subject);
3022         cb.pattern_position = GET(code, 2);
3023         cb.next_item_length = GET(code, 2 + LINK_SIZE);
3024         cb.capture_top      = 1;
3025         cb.capture_last     = -1;
3026         cb.callout_data     = md->callout_data;
3027         cb.mark             = NULL;   /* No (*MARK) support */
3028         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3029         }
3030       if (rrc == 0)
3031         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3032       break;
3033
3034
3035 /* ========================================================================== */
3036       default:        /* Unsupported opcode */
3037       return PCRE_ERROR_DFA_UITEM;
3038       }
3039
3040     NEXT_ACTIVE_STATE: continue;
3041
3042     }      /* End of loop scanning active states */
3043
3044   /* We have finished the processing at the current subject character. If no
3045   new states have been set for the next character, we have found all the
3046   matches that we are going to find. If we are at the top level and partial
3047   matching has been requested, check for appropriate conditions.
3048
3049   The "forced_fail" variable counts the number of (*F) encountered for the
3050   character. If it is equal to the original active_count (saved in
3051   workspace[1]) it means that (*F) was found on every active state. In this
3052   case we don't want to give a partial match.
3053
3054   The "could_continue" variable is true if a state could have continued but
3055   for the fact that the end of the subject was reached. */
3056
3057   if (new_count <= 0)
3058     {
3059     if (rlevel == 1 &&                               /* Top level, and */
3060         could_continue &&                            /* Some could go on, and */
3061         forced_fail != workspace[1] &&               /* Not all forced fail & */
3062         (                                            /* either... */
3063         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3064         ||                                           /* or... */
3065         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3066          match_count < 0)                            /* no matches */
3067         ) &&                                         /* And... */
3068         (
3069         partial_newline ||                           /* Either partial NL */
3070           (                                          /* or ... */
3071           ptr >= end_subject &&                /* End of subject and */
3072           ptr > md->start_used_ptr)            /* Inspected non-empty string */
3073           )
3074         )
3075       {
3076       if (offsetcount >= 2)
3077         {
3078         offsets[0] = (int)(md->start_used_ptr - start_subject);
3079         offsets[1] = (int)(end_subject - start_subject);
3080         }
3081       match_count = PCRE_ERROR_PARTIAL;
3082       }
3083
3084     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3085       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3086       rlevel*2-2, SP));
3087     break;        /* In effect, "return", but see the comment below */
3088     }
3089
3090   /* One or more states are active for the next character. */
3091
3092   ptr += clen;    /* Advance to next subject character */
3093   }               /* Loop to move along the subject string */
3094
3095 /* Control gets here from "break" a few lines above. We do it this way because
3096 if we use "return" above, we have compiler trouble. Some compilers warn if
3097 there's nothing here because they think the function doesn't return a value. On
3098 the other hand, if we put a dummy statement here, some more clever compilers
3099 complain that it can't be reached. Sigh. */
3100
3101 return match_count;
3102 }
3103
3104
3105
3106
3107 /*************************************************
3108 *    Execute a Regular Expression - DFA engine   *
3109 *************************************************/
3110
3111 /* This external function applies a compiled re to a subject string using a DFA
3112 engine. This function calls the internal function multiple times if the pattern
3113 is not anchored.
3114
3115 Arguments:
3116   argument_re     points to the compiled expression
3117   extra_data      points to extra data or is NULL
3118   subject         points to the subject string
3119   length          length of subject string (may contain binary zeros)
3120   start_offset    where to start in the subject string
3121   options         option bits
3122   offsets         vector of match offsets
3123   offsetcount     size of same
3124   workspace       workspace vector
3125   wscount         size of same
3126
3127 Returns:          > 0 => number of match offset pairs placed in offsets
3128                   = 0 => offsets overflowed; longest matches are present
3129                    -1 => failed to match
3130                  < -1 => some kind of unexpected problem
3131 */
3132
3133 #ifdef COMPILE_PCRE8
3134 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3135 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3136   const char *subject, int length, int start_offset, int options, int *offsets,
3137   int offsetcount, int *workspace, int wscount)
3138 #else
3139 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3141   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3142   int offsetcount, int *workspace, int wscount)
3143 #endif
3144 {
3145 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3146 dfa_match_data match_block;
3147 dfa_match_data *md = &match_block;
3148 BOOL utf, anchored, startline, firstline;
3149 const pcre_uchar *current_subject, *end_subject;
3150 const pcre_study_data *study = NULL;
3151
3152 const pcre_uchar *req_char_ptr;
3153 const pcre_uint8 *start_bits = NULL;
3154 BOOL has_first_char = FALSE;
3155 BOOL has_req_char = FALSE;
3156 pcre_uchar first_char = 0;
3157 pcre_uchar first_char2 = 0;
3158 pcre_uchar req_char = 0;
3159 pcre_uchar req_char2 = 0;
3160 int newline;
3161
3162 /* Plausibility checks */
3163
3164 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3165 if (re == NULL || subject == NULL || workspace == NULL ||
3166    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3167 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3168 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3169 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3170
3171 /* Check that the first field in the block is the magic number. If it is not,
3172 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3173 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3174 means that the pattern is likely compiled with different endianness. */
3175
3176 if (re->magic_number != MAGIC_NUMBER)
3177   return re->magic_number == REVERSED_MAGIC_NUMBER?
3178     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3179 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3180
3181 /* If restarting after a partial match, do some sanity checks on the contents
3182 of the workspace. */
3183
3184 if ((options & PCRE_DFA_RESTART) != 0)
3185   {
3186   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3187     workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3188       return PCRE_ERROR_DFA_BADRESTART;
3189   }
3190
3191 /* Set up study, callout, and table data */
3192
3193 md->tables = re->tables;
3194 md->callout_data = NULL;
3195
3196 if (extra_data != NULL)
3197   {
3198   unsigned int flags = extra_data->flags;
3199   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3200     study = (const pcre_study_data *)extra_data->study_data;
3201   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3202   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3203     return PCRE_ERROR_DFA_UMLIMIT;
3204   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3205     md->callout_data = extra_data->callout_data;
3206   if ((flags & PCRE_EXTRA_TABLES) != 0)
3207     md->tables = extra_data->tables;
3208   }
3209
3210 /* Set some local values */
3211
3212 current_subject = (const pcre_uchar *)subject + start_offset;
3213 end_subject = (const pcre_uchar *)subject + length;
3214 req_char_ptr = current_subject - 1;
3215
3216 #ifdef SUPPORT_UTF
3217 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218 utf = (re->options & PCRE_UTF8) != 0;
3219 #else
3220 utf = FALSE;
3221 #endif
3222
3223 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3224   (re->options & PCRE_ANCHORED) != 0;
3225
3226 /* The remaining fixed data for passing around. */
3227
3228 md->start_code = (const pcre_uchar *)argument_re +
3229     re->name_table_offset + re->name_count * re->name_entry_size;
3230 md->start_subject = (const pcre_uchar *)subject;
3231 md->end_subject = end_subject;
3232 md->start_offset = start_offset;
3233 md->moptions = options;
3234 md->poptions = re->options;
3235
3236 /* If the BSR option is not set at match time, copy what was set
3237 at compile time. */
3238
3239 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3240   {
3241   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3242     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3243 #ifdef BSR_ANYCRLF
3244   else md->moptions |= PCRE_BSR_ANYCRLF;
3245 #endif
3246   }
3247
3248 /* Handle different types of newline. The three bits give eight cases. If
3249 nothing is set at run time, whatever was used at compile time applies. */
3250
3251 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3252          PCRE_NEWLINE_BITS)
3253   {
3254   case 0: newline = NEWLINE; break;   /* Compile-time default */
3255   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3256   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3257   case PCRE_NEWLINE_CR+
3258        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3259   case PCRE_NEWLINE_ANY: newline = -1; break;
3260   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3261   default: return PCRE_ERROR_BADNEWLINE;
3262   }
3263
3264 if (newline == -2)
3265   {
3266   md->nltype = NLTYPE_ANYCRLF;
3267   }
3268 else if (newline < 0)
3269   {
3270   md->nltype = NLTYPE_ANY;
3271   }
3272 else
3273   {
3274   md->nltype = NLTYPE_FIXED;
3275   if (newline > 255)
3276     {
3277     md->nllen = 2;
3278     md->nl[0] = (newline >> 8) & 255;
3279     md->nl[1] = newline & 255;
3280     }
3281   else
3282     {
3283     md->nllen = 1;
3284     md->nl[0] = newline;
3285     }
3286   }
3287
3288 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3289 back the character offset. */
3290
3291 #ifdef SUPPORT_UTF
3292 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3293   {
3294   int erroroffset;
3295   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3296   if (errorcode != 0)
3297     {
3298     if (offsetcount >= 2)
3299       {
3300       offsets[0] = erroroffset;
3301       offsets[1] = errorcode;
3302       }
3303     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3304       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3305     }
3306   if (start_offset > 0 && start_offset < length &&
3307         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3308     return PCRE_ERROR_BADUTF8_OFFSET;
3309   }
3310 #endif
3311
3312 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3313 is a feature that makes it possible to save compiled regex and re-use them
3314 in other programs later. */
3315
3316 if (md->tables == NULL) md->tables = PRIV(default_tables);
3317
3318 /* The "must be at the start of a line" flags are used in a loop when finding
3319 where to start. */
3320
3321 startline = (re->flags & PCRE_STARTLINE) != 0;
3322 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3323
3324 /* Set up the first character to match, if available. The first_char value is
3325 never set for an anchored regular expression, but the anchoring may be forced
3326 at run time, so we have to test for anchoring. The first char may be unset for
3327 an unanchored pattern, of course. If there's no first char and the pattern was
3328 studied, there may be a bitmap of possible first characters. */
3329
3330 if (!anchored)
3331   {
3332   if ((re->flags & PCRE_FIRSTSET) != 0)
3333     {
3334     has_first_char = TRUE;
3335     first_char = first_char2 = (pcre_uchar)(re->first_char);
3336     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3337       {
3338       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3339 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340       if (utf && first_char > 127)
3341         first_char2 = UCD_OTHERCASE(first_char);
3342 #endif
3343       }
3344     }
3345   else
3346     {
3347     if (!startline && study != NULL &&
3348          (study->flags & PCRE_STUDY_MAPPED) != 0)
3349       start_bits = study->start_bits;
3350     }
3351   }
3352
3353 /* For anchored or unanchored matches, there may be a "last known required
3354 character" set. */
3355
3356 if ((re->flags & PCRE_REQCHSET) != 0)
3357   {
3358   has_req_char = TRUE;
3359   req_char = req_char2 = (pcre_uchar)(re->req_char);
3360   if ((re->flags & PCRE_RCH_CASELESS) != 0)
3361     {
3362     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3363 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364     if (utf && req_char > 127)
3365       req_char2 = UCD_OTHERCASE(req_char);
3366 #endif
3367     }
3368   }
3369
3370 /* Call the main matching function, looping for a non-anchored regex after a
3371 failed match. If not restarting, perform certain optimizations at the start of
3372 a match. */
3373
3374 for (;;)
3375   {
3376   int rc;
3377
3378   if ((options & PCRE_DFA_RESTART) == 0)
3379     {
3380     const pcre_uchar *save_end_subject = end_subject;
3381
3382     /* If firstline is TRUE, the start of the match is constrained to the first
3383     line of a multiline string. Implement this by temporarily adjusting
3384     end_subject so that we stop scanning at a newline. If the match fails at
3385     the newline, later code breaks this loop. */
3386
3387     if (firstline)
3388       {
3389       PCRE_PUCHAR t = current_subject;
3390 #ifdef SUPPORT_UTF
3391       if (utf)
3392         {
3393         while (t < md->end_subject && !IS_NEWLINE(t))
3394           {
3395           t++;
3396           ACROSSCHAR(t < end_subject, *t, t++);
3397           }
3398         }
3399       else
3400 #endif
3401       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3402       end_subject = t;
3403       }
3404
3405     /* There are some optimizations that avoid running the match if a known
3406     starting point is not found. However, there is an option that disables
3407     these, for testing and for ensuring that all callouts do actually occur.
3408     The option can be set in the regex by (*NO_START_OPT) or passed in
3409     match-time options. */
3410
3411     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3412       {
3413       /* Advance to a known first char. */
3414
3415       if (has_first_char)
3416         {
3417         if (first_char != first_char2)
3418           while (current_subject < end_subject &&
3419               *current_subject != first_char && *current_subject != first_char2)
3420             current_subject++;
3421         else
3422           while (current_subject < end_subject &&
3423                  *current_subject != first_char)
3424             current_subject++;
3425         }
3426
3427       /* Or to just after a linebreak for a multiline match if possible */
3428
3429       else if (startline)
3430         {
3431         if (current_subject > md->start_subject + start_offset)
3432           {
3433 #ifdef SUPPORT_UTF
3434           if (utf)
3435             {
3436             while (current_subject < end_subject &&
3437                    !WAS_NEWLINE(current_subject))
3438               {
3439               current_subject++;
3440               ACROSSCHAR(current_subject < end_subject, *current_subject,
3441                 current_subject++);
3442               }
3443             }
3444           else
3445 #endif
3446           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3447             current_subject++;
3448
3449           /* If we have just passed a CR and the newline option is ANY or
3450           ANYCRLF, and we are now at a LF, advance the match position by one
3451           more character. */
3452
3453           if (current_subject[-1] == CHAR_CR &&
3454                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3455                current_subject < end_subject &&
3456                *current_subject == CHAR_NL)
3457             current_subject++;
3458           }
3459         }
3460
3461       /* Or to a non-unique first char after study */
3462
3463       else if (start_bits != NULL)
3464         {
3465         while (current_subject < end_subject)
3466           {
3467           unsigned int c = *current_subject;
3468 #ifndef COMPILE_PCRE8
3469           if (c > 255) c = 255;
3470 #endif
3471           if ((start_bits[c/8] & (1 << (c&7))) == 0)
3472             {
3473             current_subject++;
3474 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475             /* In non 8-bit mode, the iteration will stop for
3476             characters > 255 at the beginning or not stop at all. */
3477             if (utf)
3478               ACROSSCHAR(current_subject < end_subject, *current_subject,
3479                 current_subject++);
3480 #endif
3481             }
3482           else break;
3483           }
3484         }
3485       }
3486
3487     /* Restore fudged end_subject */
3488
3489     end_subject = save_end_subject;
3490
3491     /* The following two optimizations are disabled for partial matching or if
3492     disabling is explicitly requested (and of course, by the test above, this
3493     code is not obeyed when restarting after a partial match). */
3494
3495     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3496         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3497       {
3498       /* If the pattern was studied, a minimum subject length may be set. This
3499       is a lower bound; no actual string of that length may actually match the
3500       pattern. Although the value is, strictly, in characters, we treat it as
3501       bytes to avoid spending too much time in this optimization. */
3502
3503       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3504           (pcre_uint32)(end_subject - current_subject) < study->minlength)
3505         return PCRE_ERROR_NOMATCH;
3506
3507       /* If req_char is set, we know that that character must appear in the
3508       subject for the match to succeed. If the first character is set, req_char
3509       must be later in the subject; otherwise the test starts at the match
3510       point. This optimization can save a huge amount of work in patterns with
3511       nested unlimited repeats that aren't going to match. Writing separate
3512       code for cased/caseless versions makes it go faster, as does using an
3513       autoincrement and backing off on a match.
3514
3515       HOWEVER: when the subject string is very, very long, searching to its end
3516       can take a long time, and give bad performance on quite ordinary
3517       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518       string... so we don't do this when the string is sufficiently long. */
3519
3520       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3521         {
3522         PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3523
3524         /* We don't need to repeat the search if we haven't yet reached the
3525         place we found it at last time. */
3526
3527         if (p > req_char_ptr)
3528           {
3529           if (req_char != req_char2)
3530             {
3531             while (p < end_subject)
3532               {
3533               int pp = *p++;
3534               if (pp == req_char || pp == req_char2) { p--; break; }
3535               }
3536             }
3537           else
3538             {
3539             while (p < end_subject)
3540               {
3541               if (*p++ == req_char) { p--; break; }
3542               }
3543             }
3544
3545           /* If we can't find the required character, break the matching loop,
3546           which will cause a return of PCRE_ERROR_NOMATCH. */
3547
3548           if (p >= end_subject) break;
3549
3550           /* If we have found the required character, save the point where we
3551           found it, so that we don't search again next time round the loop if
3552           the start hasn't passed this character yet. */
3553
3554           req_char_ptr = p;
3555           }
3556         }
3557       }
3558     }   /* End of optimizations that are done when not restarting */
3559
3560   /* OK, now we can do the business */
3561
3562   md->start_used_ptr = current_subject;
3563   md->recursive = NULL;
3564
3565   rc = internal_dfa_exec(
3566     md,                                /* fixed match data */
3567     md->start_code,                    /* this subexpression's code */
3568     current_subject,                   /* where we currently are */
3569     start_offset,                      /* start offset in subject */
3570     offsets,                           /* offset vector */
3571     offsetcount,                       /* size of same */
3572     workspace,                         /* workspace vector */
3573     wscount,                           /* size of same */
3574     0);                                /* function recurse level */
3575
3576   /* Anything other than "no match" means we are done, always; otherwise, carry
3577   on only if not anchored. */
3578
3579   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3580
3581   /* Advance to the next subject character unless we are at the end of a line
3582   and firstline is set. */
3583
3584   if (firstline && IS_NEWLINE(current_subject)) break;
3585   current_subject++;
3586 #ifdef SUPPORT_UTF
3587   if (utf)
3588     {
3589     ACROSSCHAR(current_subject < end_subject, *current_subject,
3590       current_subject++);
3591     }
3592 #endif
3593   if (current_subject > end_subject) break;
3594
3595   /* If we have just passed a CR and we are now at a LF, and the pattern does
3596   not contain any explicit matches for \r or \n, and the newline option is CRLF
3597   or ANY or ANYCRLF, advance the match position by one more character. */
3598
3599   if (current_subject[-1] == CHAR_CR &&
3600       current_subject < end_subject &&
3601       *current_subject == CHAR_NL &&
3602       (re->flags & PCRE_HASCRORLF) == 0 &&
3603         (md->nltype == NLTYPE_ANY ||
3604          md->nltype == NLTYPE_ANYCRLF ||
3605          md->nllen == 2))
3606     current_subject++;
3607
3608   }   /* "Bumpalong" loop */
3609
3610 return PCRE_ERROR_NOMATCH;
3611 }
3612
3613 /* End of pcre_dfa_exec.c */