src/third_party/icu/source/i18n/regexcmp.cpp

   1 //
   2 //  file:  regexcmp.cpp
   3 //
   4 //  Copyright (C) 2002-2013 International Business Machines Corporation and others.
   5 //  All Rights Reserved.
   6 //
   7 //  This file contains the ICU regular expression compiler, which is responsible
   8 //  for processing a regular expression pattern into the compiled form that
   9 //  is used by the match finding engine.
  10 //
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  15
  16 #include "unicode/ustring.h"
  17 #include "unicode/unistr.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/uchar.h"
  20 #include "unicode/uchriter.h"
  21 #include "unicode/parsepos.h"
  22 #include "unicode/parseerr.h"
  23 #include "unicode/regex.h"
  24 #include "unicode/utf.h"
  25 #include "unicode/utf16.h"
  26 #include "patternprops.h"
  27 #include "putilimp.h"
  28 #include "cmemory.h"
  29 #include "cstring.h"
  30 #include "uvectr32.h"
  31 #include "uvectr64.h"
  32 #include "uassert.h"
  33 #include "ucln_in.h"
  34 #include "uinvchar.h"
  35
  36 #include "regeximp.h"
  37 #include "regexcst.h"   // Contains state table for the regex pattern parser.
  38                         //   generated by a Perl script.
  39 #include "regexcmp.h"
  40 #include "regexst.h"
  41 #include "regextxt.h"
  42
  43
  44
  45 U_NAMESPACE_BEGIN
  46
  47
  48 //------------------------------------------------------------------------------
  49 //
  50 //  Constructor.
  51 //
  52 //------------------------------------------------------------------------------
  53 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
  54    fParenStack(status), fSetStack(status), fSetOpStack(status)
  55 {
  56     // Lazy init of all shared global sets (needed for init()'s empty text)
  57     RegexStaticSets::initGlobals(&status);
  58
  59     fStatus           = &status;
  60
  61     fRXPat            = rxp;
  62     fScanIndex        = 0;
  63     fLastChar         = -1;
  64     fPeekChar         = -1;
  65     fLineNum          = 1;
  66     fCharNum          = 0;
  67     fQuoteMode        = FALSE;
  68     fInBackslashQuote = FALSE;
  69     fModeFlags        = fRXPat->fFlags | 0x80000000;
  70     fEOLComments      = TRUE;
  71
  72     fMatchOpenParen   = -1;
  73     fMatchCloseParen  = -1;
  74
  75     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
  76         status = rxp->fDeferredStatus;
  77     }
  78 }
  79
  80 static const UChar      chAmp       = 0x26;      // '&'
  81 static const UChar      chDash      = 0x2d;      // '-'
  82
  83
  84 //------------------------------------------------------------------------------
  85 //
  86 //  Destructor
  87 //
  88 //------------------------------------------------------------------------------
  89 RegexCompile::~RegexCompile() {
  90 }
  91
  92 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
  93     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
  94 }
  95
  96 //------------------------------------------------------------------------------
  97 //
  98 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
  99 //                           The state tables are hand-written in the file regexcst.txt,
 100 //                           and converted to the form used here by a perl
 101 //                           script regexcst.pl
 102 //
 103 //------------------------------------------------------------------------------
 104 void    RegexCompile::compile(
 105                          const UnicodeString &pat,   // Source pat to be compiled.
 106                          UParseError &pp,            // Error position info
 107                          UErrorCode &e)              // Error Code
 108 {
 109     fRXPat->fPatternString = new UnicodeString(pat);
 110     UText patternText = UTEXT_INITIALIZER;
 111     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
 112
 113     if (U_SUCCESS(e)) {
 114         compile(&patternText, pp, e);
 115         utext_close(&patternText);
 116     }
 117 }
 118
 119 //
 120 //   compile, UText mode
 121 //     All the work is actually done here.
 122 //
 123 void    RegexCompile::compile(
 124                          UText *pat,                 // Source pat to be compiled.
 125                          UParseError &pp,            // Error position info
 126                          UErrorCode &e)              // Error Code
 127 {
 128     fStatus             = &e;
 129     fParseErr           = &pp;
 130     fStackPtr           = 0;
 131     fStack[fStackPtr]   = 0;
 132
 133     if (U_FAILURE(*fStatus)) {
 134         return;
 135     }
 136
 137     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
 138     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
 139
 140     // Prepare the RegexPattern object to receive the compiled pattern.
 141     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
 142     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
 143     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
 144
 145
 146     // Initialize the pattern scanning state machine
 147     fPatternLength = utext_nativeLength(pat);
 148     uint16_t                state = 1;
 149     const RegexTableEl      *tableEl;
 150
 151     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
 152     if (fModeFlags & UREGEX_LITERAL) {
 153         fQuoteMode = TRUE;
 154     }
 155
 156     nextChar(fC);                        // Fetch the first char from the pattern string.
 157
 158     //
 159     // Main loop for the regex pattern parsing state machine.
 160     //   Runs once per state transition.
 161     //   Each time through optionally performs, depending on the state table,
 162     //      - an advance to the the next pattern char
 163     //      - an action to be performed.
 164     //      - pushing or popping a state to/from the local state return stack.
 165     //   file regexcst.txt is the source for the state table.  The logic behind
 166     //     recongizing the pattern syntax is there, not here.
 167     //
 168     for (;;) {
 169         //  Bail out if anything has gone wrong.
 170         //  Regex pattern parsing stops on the first error encountered.
 171         if (U_FAILURE(*fStatus)) {
 172             break;
 173         }
 174
 175         U_ASSERT(state != 0);
 176
 177         // Find the state table element that matches the input char from the pattern, or the
 178         //    class of the input character.  Start with the first table row for this
 179         //    state, then linearly scan forward until we find a row that matches the
 180         //    character.  The last row for each state always matches all characters, so
 181         //    the search will stop there, if not before.
 182         //
 183         tableEl = &gRuleParseStateTable[state];
 184         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
 185             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
 186
 187         for (;;) {    // loop through table rows belonging to this state, looking for one
 188                       //   that matches the current input char.
 189             REGEX_SCAN_DEBUG_PRINTF(("."));
 190             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
 191                 // Table row specified an individual character, not a set, and
 192                 //   the input character is not quoted, and
 193                 //   the input character matched it.
 194                 break;
 195             }
 196             if (tableEl->fCharClass == 255) {
 197                 // Table row specified default, match anything character class.
 198                 break;
 199             }
 200             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
 201                 // Table row specified "quoted" and the char was quoted.
 202                 break;
 203             }
 204             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
 205                 // Table row specified eof and we hit eof on the input.
 206                 break;
 207             }
 208
 209             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
 210                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
 211                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
 212                 U_ASSERT(tableEl->fCharClass <= 137);
 213                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
 214                     // Table row specified a character class, or set of characters,
 215                     //   and the current char matches it.
 216                     break;
 217                 }
 218             }
 219
 220             // No match on this row, advance to the next  row for this state,
 221             tableEl++;
 222         }
 223         REGEX_SCAN_DEBUG_PRINTF(("\n"));
 224
 225         //
 226         // We've found the row of the state table that matches the current input
 227         //   character from the rules string.
 228         // Perform any action specified  by this row in the state table.
 229         if (doParseActions(tableEl->fAction) == FALSE) {
 230             // Break out of the state machine loop if the
 231             //   the action signalled some kind of error, or
 232             //   the action was to exit, occurs on normal end-of-rules-input.
 233             break;
 234         }
 235
 236         if (tableEl->fPushState != 0) {
 237             fStackPtr++;
 238             if (fStackPtr >= kStackSize) {
 239                 error(U_REGEX_INTERNAL_ERROR);
 240                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
 241                 fStackPtr--;
 242             }
 243             fStack[fStackPtr] = tableEl->fPushState;
 244         }
 245
 246         //
 247         //  NextChar.  This is where characters are actually fetched from the pattern.
 248         //             Happens under control of the 'n' tag in the state table.
 249         //
 250         if (tableEl->fNextChar) {
 251             nextChar(fC);
 252         }
 253
 254         // Get the next state from the table entry, or from the
 255         //   state stack if the next state was specified as "pop".
 256         if (tableEl->fNextState != 255) {
 257             state = tableEl->fNextState;
 258         } else {
 259             state = fStack[fStackPtr];
 260             fStackPtr--;
 261             if (fStackPtr < 0) {
 262                 // state stack underflow
 263                 // This will occur if the user pattern has mis-matched parentheses,
 264                 //   with extra close parens.
 265                 //
 266                 fStackPtr++;
 267                 error(U_REGEX_MISMATCHED_PAREN);
 268             }
 269         }
 270
 271     }
 272
 273     if (U_FAILURE(*fStatus)) {
 274         // Bail out if the pattern had errors.
 275         //   Set stack cleanup:  a successful compile would have left it empty,
 276         //   but errors can leave temporary sets hanging around.
 277         while (!fSetStack.empty()) {
 278             delete (UnicodeSet *)fSetStack.pop();
 279         }
 280         return;
 281     }
 282
 283     //
 284     // The pattern has now been read and processed, and the compiled code generated.
 285     //
 286
 287     //
 288     // Compute the number of digits requried for the largest capture group number.
 289     //
 290     fRXPat->fMaxCaptureDigits = 1;
 291     int32_t  n = 10;
 292     int32_t  groupCount = fRXPat->fGroupMap->size();
 293     while (n <= groupCount) {
 294         fRXPat->fMaxCaptureDigits++;
 295         n *= 10;
 296     }
 297
 298     //
 299     // The pattern's fFrameSize so far has accumulated the requirements for
 300     //   storage for capture parentheses, counters, etc. that are encountered
 301     //   in the pattern.  Add space for the two variables that are always
 302     //   present in the saved state:  the input string position (int64_t) and
 303     //   the position in the compiled pattern.
 304     //
 305     allocateStackData(RESTACKFRAME_HDRCOUNT);
 306
 307     //
 308     // Optimization pass 1: NOPs, back-references, and case-folding
 309     //
 310     stripNOPs();
 311
 312     //
 313     // Get bounds for the minimum and maximum length of a string that this
 314     //   pattern can match.  Used to avoid looking for matches in strings that
 315     //   are too short.
 316     //
 317     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
 318
 319     //
 320     // Optimization pass 2: match start type
 321     //
 322     matchStartType();
 323
 324     //
 325     // Set up fast latin-1 range sets
 326     //
 327     int32_t numSets = fRXPat->fSets->size();
 328     fRXPat->fSets8 = new Regex8BitSet[numSets];
 329     // Null pointer check.
 330     if (fRXPat->fSets8 == NULL) {
 331         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
 332         return;
 333     }
 334     int32_t i;
 335     for (i=0; i<numSets; i++) {
 336         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
 337         fRXPat->fSets8[i].init(s);
 338     }
 339
 340 }
 341
 342
 343
 344
 345
 346 //------------------------------------------------------------------------------
 347 //
 348 //  doParseAction        Do some action during regex pattern parsing.
 349 //                       Called by the parse state machine.
 350 //
 351 //                       Generation of the match engine PCode happens here, or
 352 //                       in functions called from the parse actions defined here.
 353 //
 354 //
 355 //------------------------------------------------------------------------------
 356 UBool RegexCompile::doParseActions(int32_t action)
 357 {
 358     UBool   returnVal = TRUE;
 359
 360     switch ((Regex_PatternParseAction)action) {
 361
 362     case doPatStart:
 363         // Start of pattern compiles to:
 364         //0   SAVE   2        Fall back to position of FAIL
 365         //1   jmp    3
 366         //2   FAIL            Stop if we ever reach here.
 367         //3   NOP             Dummy, so start of pattern looks the same as
 368         //                    the start of an ( grouping.
 369         //4   NOP             Resreved, will be replaced by a save if there are
 370         //                    OR | operators at the top level
 371         appendOp(URX_BUILD(URX_STATE_SAVE, 2));
 372         appendOp(URX_BUILD(URX_JMP,  3));
 373         appendOp(URX_BUILD(URX_FAIL, 0));
 374
 375         // Standard open nonCapture paren action emits the two NOPs and
 376         //   sets up the paren stack frame.
 377         doParseActions(doOpenNonCaptureParen);
 378         break;
 379
 380     case doPatFinish:
 381         // We've scanned to the end of the pattern
 382         //  The end of pattern compiles to:
 383         //        URX_END
 384         //    which will stop the runtime match engine.
 385         //  Encountering end of pattern also behaves like a close paren,
 386         //   and forces fixups of the State Save at the beginning of the compiled pattern
 387         //   and of any OR operations at the top level.
 388         //
 389         handleCloseParen();
 390         if (fParenStack.size() > 0) {
 391             // Missing close paren in pattern.
 392             error(U_REGEX_MISMATCHED_PAREN);
 393         }
 394
 395         // add the END operation to the compiled pattern.
 396         appendOp(URX_BUILD(URX_END, 0));
 397
 398         // Terminate the pattern compilation state machine.
 399         returnVal = FALSE;
 400         break;
 401
 402
 403
 404     case doOrOperator:
 405         // Scanning a '|', as in (A|B)
 406         {
 407             // Generate code for any pending literals preceding the '|'
 408             fixLiterals(FALSE);
 409
 410             // Insert a SAVE operation at the start of the pattern section preceding
 411             //   this OR at this level.  This SAVE will branch the match forward
 412             //   to the right hand side of the OR in the event that the left hand
 413             //   side fails to match and backtracks.  Locate the position for the
 414             //   save from the location on the top of the parentheses stack.
 415             int32_t savePosition = fParenStack.popi();
 416             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
 417             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
 418             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
 419             fRXPat->fCompiledPat->setElementAt(op, savePosition);
 420
 421             // Append an JMP operation into the compiled pattern.  The operand for
 422             //  the JMP will eventually be the location following the ')' for the
 423             //  group.  This will be patched in later, when the ')' is encountered.
 424             op = URX_BUILD(URX_JMP, 0);
 425             appendOp(op);
 426
 427             // Push the position of the newly added JMP op onto the parentheses stack.
 428             // This registers if for fixup when this block's close paren is encountered.
 429             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 430
 431             // Append a NOP to the compiled pattern.  This is the slot reserved
 432             //   for a SAVE in the event that there is yet another '|' following
 433             //   this one.
 434             appendOp(URX_BUILD(URX_NOP, 0));
 435             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
 436         }
 437         break;
 438
 439
 440     case doOpenCaptureParen:
 441         // Open Paren.
 442         //   Compile to a
 443         //      - NOP, which later may be replaced by a save-state if the
 444         //         parenthesized group gets a * quantifier, followed by
 445         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
 446         //      - NOP, which may later be replaced by a save-state if there
 447         //             is an '|' alternation within the parens.
 448         //
 449         //    Each capture group gets three slots in the save stack frame:
 450         //         0: Capture Group start position (in input string being matched.)
 451         //         1: Capture Group end position.
 452         //         2: Start of Match-in-progress.
 453         //    The first two locations are for a completed capture group, and are
 454         //     referred to by back references and the like.
 455         //    The third location stores the capture start position when an START_CAPTURE is
 456         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
 457         //      END_CAPTURE is encountered.
 458         {
 459             fixLiterals();
 460             appendOp(URX_BUILD(URX_NOP, 0));
 461             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
 462             int32_t  cop     = URX_BUILD(URX_START_CAPTURE, varsLoc);
 463             appendOp(cop);
 464             appendOp(URX_BUILD(URX_NOP, 0));
 465
 466             // On the Parentheses stack, start a new frame and add the postions
 467             //   of the two NOPs.  Depending on what follows in the pattern, the
 468             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 469             //   address of the end of the parenthesized group.
 470             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 471             fParenStack.push(capturing, *fStatus);                        // Frame type.
 472             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
 473             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 474
 475             // Save the mapping from group number to stack frame variable position.
 476             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
 477         }
 478          break;
 479
 480     case doOpenNonCaptureParen:
 481         // Open non-caputuring (grouping only) Paren.
 482         //   Compile to a
 483         //      - NOP, which later may be replaced by a save-state if the
 484         //         parenthesized group gets a * quantifier, followed by
 485         //      - NOP, which may later be replaced by a save-state if there
 486         //             is an '|' alternation within the parens.
 487         {
 488             fixLiterals();
 489             appendOp(URX_BUILD(URX_NOP, 0));
 490             appendOp(URX_BUILD(URX_NOP, 0));
 491
 492             // On the Parentheses stack, start a new frame and add the postions
 493             //   of the two NOPs.
 494             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 495             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
 496             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 497             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
 498         }
 499          break;
 500
 501
 502     case doOpenAtomicParen:
 503         // Open Atomic Paren.  (?>
 504         //   Compile to a
 505         //      - NOP, which later may be replaced if the parenthesized group
 506         //         has a quantifier, followed by
 507         //      - STO_SP  save state stack position, so it can be restored at the ")"
 508         //      - NOP, which may later be replaced by a save-state if there
 509         //             is an '|' alternation within the parens.
 510         {
 511             fixLiterals();
 512             appendOp(URX_BUILD(URX_NOP, 0));
 513             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
 514             int32_t  stoOp = URX_BUILD(URX_STO_SP, varLoc);
 515             appendOp(stoOp);
 516             appendOp(URX_BUILD(URX_NOP, 0));
 517
 518             // On the Parentheses stack, start a new frame and add the postions
 519             //   of the two NOPs.  Depending on what follows in the pattern, the
 520             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
 521             //   address of the end of the parenthesized group.
 522             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 523             fParenStack.push(atomic, *fStatus);                           // Frame type.
 524             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
 525             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
 526         }
 527         break;
 528
 529
 530     case doOpenLookAhead:
 531         // Positive Look-ahead   (?=  stuff  )
 532         //
 533         //   Note:   Addition of transparent input regions, with the need to
 534         //           restore the original regions when failing out of a lookahead
 535         //           block, complicated this sequence.  Some conbined opcodes
 536         //           might make sense - or might not, lookahead aren't that common.
 537         //
 538         //      Caution:  min match length optimization knows about this
 539         //               sequence; don't change without making updates there too.
 540         //
 541         // Compiles to
 542         //    1    START_LA     dataLoc     Saves SP, Input Pos
 543         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
 544         //    3    JMP          6           continue ...
 545         //
 546         //    4.   LA_END                   Look Ahead failed.  Restore regions.
 547         //    5.   BACKTRACK                and back track again.
 548         //
 549         //    6.   NOP              reserved for use by quantifiers on the block.
 550         //                          Look-ahead can't have quantifiers, but paren stack
 551         //                             compile time conventions require the slot anyhow.
 552         //    7.   NOP              may be replaced if there is are '|' ops in the block.
 553         //    8.     code for parenthesized stuff.
 554         //    9.   LA_END
 555         //
 556         //  Two data slots are reserved, for saving the stack ptr and the input position.
 557         {
 558             fixLiterals();
 559             int32_t dataLoc = allocateData(2);
 560             int32_t op = URX_BUILD(URX_LA_START, dataLoc);
 561             appendOp(op);
 562
 563             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
 564             appendOp(op);
 565
 566             op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
 567             appendOp(op);
 568
 569             op = URX_BUILD(URX_LA_END, dataLoc);
 570             appendOp(op);
 571
 572             op = URX_BUILD(URX_BACKTRACK, 0);
 573             appendOp(op);
 574
 575             op = URX_BUILD(URX_NOP, 0);
 576             appendOp(op);
 577             appendOp(op);
 578
 579             // On the Parentheses stack, start a new frame and add the postions
 580             //   of the NOPs.
 581             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 582             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
 583             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
 584             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 585         }
 586         break;
 587
 588     case doOpenLookAheadNeg:
 589         // Negated Lookahead.   (?! stuff )
 590         // Compiles to
 591         //    1.    START_LA    dataloc
 592         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
 593         //                                //   which continues with the match.
 594         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
 595         //    4.       code for parenthesized stuff.
 596         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
 597         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
 598         //    7.    END_LA                // Restore match region, in case look-ahead was using
 599         //                                        an alternate (transparent) region.
 600         {
 601             fixLiterals();
 602             int32_t dataLoc = allocateData(2);
 603             int32_t op = URX_BUILD(URX_LA_START, dataLoc);
 604             appendOp(op);
 605
 606             op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
 607             appendOp(op);
 608
 609             op = URX_BUILD(URX_NOP, 0);
 610             appendOp(op);
 611
 612             // On the Parentheses stack, start a new frame and add the postions
 613             //   of the StateSave and NOP.
 614             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 615             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
 616             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
 617             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
 618
 619             // Instructions #5 - #7 will be added when the ')' is encountered.
 620         }
 621         break;
 622
 623     case doOpenLookBehind:
 624         {
 625             //   Compile a (?<= look-behind open paren.
 626             //
 627             //          Compiles to
 628             //              0       URX_LB_START     dataLoc
 629             //              1       URX_LB_CONT      dataLoc
 630             //              2                        MinMatchLen
 631             //              3                        MaxMatchLen
 632             //              4       URX_NOP          Standard '(' boilerplate.
 633             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
 634             //              6         <code for LookBehind expression>
 635             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
 636             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
 637             //
 638             //          Allocate a block of matcher data, to contain (when running a match)
 639             //              0:    Stack ptr on entry
 640             //              1:    Input Index on entry
 641             //              2:    Start index of match current match attempt.
 642             //              3:    Original Input String len.
 643
 644             // Generate match code for any pending literals.
 645             fixLiterals();
 646
 647             // Allocate data space
 648             int32_t dataLoc = allocateData(4);
 649
 650             // Emit URX_LB_START
 651             int32_t op = URX_BUILD(URX_LB_START, dataLoc);
 652             appendOp(op);
 653
 654             // Emit URX_LB_CONT
 655             op = URX_BUILD(URX_LB_CONT, dataLoc);
 656             appendOp(op);
 657             appendOp(0);    // MinMatchLength.  To be filled later.
 658             appendOp(0);    // MaxMatchLength.  To be filled later.
 659
 660             // Emit the NOP
 661             op = URX_BUILD(URX_NOP, 0);
 662             appendOp(op);
 663             appendOp(op);
 664
 665             // On the Parentheses stack, start a new frame and add the postions
 666             //   of the URX_LB_CONT and the NOP.
 667             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 668             fParenStack.push(lookBehind, *fStatus);                       // Frame type
 669             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 670             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 671
 672             // The final two instructions will be added when the ')' is encountered.
 673         }
 674
 675         break;
 676
 677     case doOpenLookBehindNeg:
 678         {
 679             //   Compile a (?<! negated look-behind open paren.
 680             //
 681             //          Compiles to
 682             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
 683             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
 684             //              2                        MinMatchLen
 685             //              3                        MaxMatchLen
 686             //              4                        continueLoc (9)
 687             //              5       URX_NOP          Standard '(' boilerplate.
 688             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
 689             //              7         <code for LookBehind expression>
 690             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
 691             //              9       ...
 692             //
 693             //          Allocate a block of matcher data, to contain (when running a match)
 694             //              0:    Stack ptr on entry
 695             //              1:    Input Index on entry
 696             //              2:    Start index of match current match attempt.
 697             //              3:    Original Input String len.
 698
 699             // Generate match code for any pending literals.
 700             fixLiterals();
 701
 702             // Allocate data space
 703             int32_t dataLoc = allocateData(4);
 704
 705             // Emit URX_LB_START
 706             int32_t op = URX_BUILD(URX_LB_START, dataLoc);
 707             appendOp(op);
 708
 709             // Emit URX_LBN_CONT
 710             op = URX_BUILD(URX_LBN_CONT, dataLoc);
 711             appendOp(op);
 712             appendOp(0);    // MinMatchLength.  To be filled later.
 713             appendOp(0);    // MaxMatchLength.  To be filled later.
 714             appendOp(0);    // Continue Loc.    To be filled later.
 715
 716             // Emit the NOP
 717             op = URX_BUILD(URX_NOP, 0);
 718             appendOp(op);
 719             appendOp(op);
 720
 721             // On the Parentheses stack, start a new frame and add the postions
 722             //   of the URX_LB_CONT and the NOP.
 723             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
 724             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
 725             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
 726             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
 727
 728             // The final two instructions will be added when the ')' is encountered.
 729         }
 730         break;
 731
 732     case doConditionalExpr:
 733         // Conditionals such as (?(1)a:b)
 734     case doPerlInline:
 735         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
 736         error(U_REGEX_UNIMPLEMENTED);
 737         break;
 738
 739
 740     case doCloseParen:
 741         handleCloseParen();
 742         if (fParenStack.size() <= 0) {
 743             //  Extra close paren, or missing open paren.
 744             error(U_REGEX_MISMATCHED_PAREN);
 745         }
 746         break;
 747
 748     case doNOP:
 749         break;
 750
 751
 752     case doBadOpenParenType:
 753     case doRuleError:
 754         error(U_REGEX_RULE_SYNTAX);
 755         break;
 756
 757
 758     case doMismatchedParenErr:
 759         error(U_REGEX_MISMATCHED_PAREN);
 760         break;
 761
 762     case doPlus:
 763         //  Normal '+'  compiles to
 764         //     1.   stuff to be repeated  (already built)
 765         //     2.   jmp-sav 1
 766         //     3.   ...
 767         //
 768         //  Or, if the item to be repeated can match a zero length string,
 769         //     1.   STO_INP_LOC  data-loc
 770         //     2.      body of stuff to be repeated
 771         //     3.   JMP_SAV_X    2
 772         //     4.   ...
 773
 774         //
 775         //  Or, if the item to be repeated is simple
 776         //     1.   Item to be repeated.
 777         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
 778         //     3.   LOOP_C       stack location
 779         {
 780             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
 781             int32_t  frameLoc;
 782
 783             // Check for simple constructs, which may get special optimized code.
 784             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 785                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 786
 787                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 788                     // Emit optimized code for [char set]+
 789                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 790                     appendOp(loopOpI);
 791                     frameLoc = allocateStackData(1);
 792                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
 793                     appendOp(loopOpC);
 794                     break;
 795                 }
 796
 797                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 798                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 799                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 800                     // Emit Optimized code for .+ operations.
 801                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
 802                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 803                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
 804                         loopOpI |= 1;
 805                     }
 806                     if (fModeFlags & UREGEX_UNIX_LINES) {
 807                         loopOpI |= 2;
 808                     }
 809                     appendOp(loopOpI);
 810                     frameLoc = allocateStackData(1);
 811                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
 812                     appendOp(loopOpC);
 813                     break;
 814                 }
 815
 816             }
 817
 818             // General case.
 819
 820             // Check for minimum match length of zero, which requires
 821             //    extra loop-breaking code.
 822             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 823                 // Zero length match is possible.
 824                 // Emit the code sequence that can handle it.
 825                 insertOp(topLoc);
 826                 frameLoc = allocateStackData(1);
 827
 828                 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
 829                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
 830
 831                 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
 832                 appendOp(op);
 833             } else {
 834                 // Simpler code when the repeated body must match something non-empty
 835                 int32_t  jmpOp  = URX_BUILD(URX_JMP_SAV, topLoc);
 836                 appendOp(jmpOp);
 837             }
 838         }
 839         break;
 840
 841     case doNGPlus:
 842         //  Non-greedy '+?'  compiles to
 843         //     1.   stuff to be repeated  (already built)
 844         //     2.   state-save  1
 845         //     3.   ...
 846         {
 847             int32_t topLoc      = blockTopLoc(FALSE);
 848             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
 849             appendOp(saveStateOp);
 850         }
 851         break;
 852
 853
 854     case doOpt:
 855         // Normal (greedy) ? quantifier.
 856         //  Compiles to
 857         //     1. state save 3
 858         //     2.    body of optional block
 859         //     3. ...
 860         // Insert the state save into the compiled pattern, and we're done.
 861         {
 862             int32_t   saveStateLoc = blockTopLoc(TRUE);
 863             int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
 864             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 865         }
 866         break;
 867
 868     case doNGOpt:
 869         // Non-greedy ?? quantifier
 870         //   compiles to
 871         //    1.  jmp   4
 872         //    2.     body of optional block
 873         //    3   jmp   5
 874         //    4.  state save 2
 875         //    5    ...
 876         //  This code is less than ideal, with two jmps instead of one, because we can only
 877         //  insert one instruction at the top of the block being iterated.
 878         {
 879             int32_t  jmp1_loc = blockTopLoc(TRUE);
 880             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
 881
 882             int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
 883             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
 884
 885             int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
 886             appendOp(jmp2_op);
 887
 888             int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
 889             appendOp(save_op);
 890         }
 891         break;
 892
 893
 894     case doStar:
 895         // Normal (greedy) * quantifier.
 896         // Compiles to
 897         //       1.   STATE_SAVE   4
 898         //       2.      body of stuff being iterated over
 899         //       3.   JMP_SAV      2
 900         //       4.   ...
 901         //
 902         // Or, if the body is a simple [Set],
 903         //       1.   LOOP_SR_I    set number
 904         //       2.   LOOP_C       stack location
 905         //       ...
 906         //
 907         // Or if this is a .*
 908         //       1.   LOOP_DOT_I    (. matches all mode flag)
 909         //       2.   LOOP_C        stack location
 910         //
 911         // Or, if the body can match a zero-length string, to inhibit infinite loops,
 912         //       1.   STATE_SAVE   5
 913         //       2.   STO_INP_LOC  data-loc
 914         //       3.      body of stuff
 915         //       4.   JMP_SAV_X    2
 916         //       5.   ...
 917         {
 918             // location of item #1, the STATE_SAVE
 919             int32_t   topLoc = blockTopLoc(FALSE);
 920             int32_t   dataLoc = -1;
 921
 922             // Check for simple *, where the construct being repeated
 923             //   compiled to single opcode, and might be optimizable.
 924             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
 925                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
 926
 927                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
 928                     // Emit optimized code for a [char set]*
 929                     int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
 930                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 931                     dataLoc = allocateStackData(1);
 932                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
 933                     appendOp(loopOpC);
 934                     break;
 935                 }
 936
 937                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
 938                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
 939                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
 940                     // Emit Optimized code for .* operations.
 941                     int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
 942                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
 943                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
 944                         loopOpI |= 1;
 945                     }
 946                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
 947                         loopOpI |= 2;
 948                     }
 949                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
 950                     dataLoc = allocateStackData(1);
 951                     int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
 952                     appendOp(loopOpC);
 953                     break;
 954                 }
 955             }
 956
 957             // Emit general case code for this *
 958             // The optimizations did not apply.
 959
 960             int32_t   saveStateLoc = blockTopLoc(TRUE);
 961             int32_t   jmpOp        = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
 962
 963             // Check for minimum match length of zero, which requires
 964             //    extra loop-breaking code.
 965             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
 966                 insertOp(saveStateLoc);
 967                 dataLoc = allocateStackData(1);
 968
 969                 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
 970                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
 971                 jmpOp      = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
 972             }
 973
 974             // Locate the position in the compiled pattern where the match will continue
 975             //   after completing the *.   (4 or 5 in the comment above)
 976             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
 977
 978             // Put together the save state op store it into the compiled code.
 979             int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
 980             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
 981
 982             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
 983             appendOp(jmpOp);
 984         }
 985         break;
 986
 987     case doNGStar:
 988         // Non-greedy *? quantifier
 989         // compiles to
 990         //     1.   JMP    3
 991         //     2.      body of stuff being iterated over
 992         //     3.   STATE_SAVE  2
 993         //     4    ...
 994         {
 995             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
 996             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
 997             int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
 998             int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
 999             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000             appendOp(stateSaveOp);
1001         }
1002         break;
1003
1004
1005     case doIntervalInit:
1006         // The '{' opening an interval quantifier was just scanned.
1007         // Init the counter varaiables that will accumulate the values as the digits
1008         //    are scanned.
1009         fIntervalLow = 0;
1010         fIntervalUpper = -1;
1011         break;
1012
1013     case doIntevalLowerDigit:
1014         // Scanned a digit from the lower value of an {lower,upper} interval
1015         {
1016             int32_t digitValue = u_charDigitValue(fC.fChar);
1017             U_ASSERT(digitValue >= 0);
1018             fIntervalLow = fIntervalLow*10 + digitValue;
1019             if (fIntervalLow < 0) {
1020                 error(U_REGEX_NUMBER_TOO_BIG);
1021             }
1022         }
1023         break;
1024
1025     case doIntervalUpperDigit:
1026         // Scanned a digit from the upper value of an {lower,upper} interval
1027         {
1028             if (fIntervalUpper < 0) {
1029                 fIntervalUpper = 0;
1030             }
1031             int32_t digitValue = u_charDigitValue(fC.fChar);
1032             U_ASSERT(digitValue >= 0);
1033             fIntervalUpper = fIntervalUpper*10 + digitValue;
1034             if (fIntervalUpper < 0) {
1035                 error(U_REGEX_NUMBER_TOO_BIG);
1036             }
1037         }
1038         break;
1039
1040     case doIntervalSame:
1041         // Scanned a single value interval like {27}.  Upper = Lower.
1042         fIntervalUpper = fIntervalLow;
1043         break;
1044
1045     case doInterval:
1046         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1047         if (compileInlineInterval() == FALSE) {
1048             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1049         }
1050         break;
1051
1052     case doPossessiveInterval:
1053         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1054         {
1055             // Remember the loc for the top of the block being looped over.
1056             //   (Can not reserve a slot in the compiled pattern at this time, because
1057             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1058             //    once per block.)
1059             int32_t topLoc = blockTopLoc(FALSE);
1060
1061             // Produce normal looping code.
1062             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1063
1064             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1065             //  just as if the loop was inclosed in atomic parentheses.
1066
1067             // First the STO_SP before the start of the loop
1068             insertOp(topLoc);
1069
1070             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1071             int32_t  op     = URX_BUILD(URX_STO_SP, varLoc);
1072             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1073
1074             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1075             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1076             loopOp++;     // point LoopOp after the just-inserted STO_SP
1077             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1078
1079             // Then the LD_SP after the end of the loop
1080             op = URX_BUILD(URX_LD_SP, varLoc);
1081             appendOp(op);
1082         }
1083
1084         break;
1085
1086     case doNGInterval:
1087         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1088         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1089         break;
1090
1091     case doIntervalError:
1092         error(U_REGEX_BAD_INTERVAL);
1093         break;
1094
1095     case doLiteralChar:
1096         // We've just scanned a "normal" character from the pattern,
1097         literalChar(fC.fChar);
1098         break;
1099
1100
1101     case doEscapedLiteralChar:
1102         // We've just scanned an backslashed escaped character with  no
1103         //   special meaning.  It represents itself.
1104         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1105             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1106             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1107                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1108              }
1109         literalChar(fC.fChar);
1110         break;
1111
1112
1113     case doDotAny:
1114         // scanned a ".",  match any single character.
1115         {
1116             fixLiterals(FALSE);
1117             int32_t   op;
1118             if (fModeFlags & UREGEX_DOTALL) {
1119                 op = URX_BUILD(URX_DOTANY_ALL, 0);
1120             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1121                 op = URX_BUILD(URX_DOTANY_UNIX, 0);
1122             } else {
1123                 op = URX_BUILD(URX_DOTANY, 0);
1124             }
1125             appendOp(op);
1126         }
1127         break;
1128
1129     case doCaret:
1130         {
1131             fixLiterals(FALSE);
1132             int32_t op = 0;
1133             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134                 op = URX_CARET;
1135             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                 op = URX_CARET_M;
1137             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138                 op = URX_CARET;   // Only testing true start of input.
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                 op = URX_CARET_M_UNIX;
1141             }
1142             appendOp(URX_BUILD(op, 0));
1143         }
1144         break;
1145
1146     case doDollar:
1147         {
1148             fixLiterals(FALSE);
1149             int32_t op = 0;
1150             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151                 op = URX_DOLLAR;
1152             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1153                 op = URX_DOLLAR_M;
1154             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155                 op = URX_DOLLAR_D;
1156             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1157                 op = URX_DOLLAR_MD;
1158             }
1159             appendOp(URX_BUILD(op, 0));
1160         }
1161         break;
1162
1163     case doBackslashA:
1164         fixLiterals(FALSE);
1165         appendOp(URX_BUILD(URX_CARET, 0));
1166         break;
1167
1168     case doBackslashB:
1169         {
1170             #if  UCONFIG_NO_BREAK_ITERATION==1
1171             if (fModeFlags & UREGEX_UWORD) {
1172                 error(U_UNSUPPORTED_ERROR);
1173             }
1174             #endif
1175             fixLiterals(FALSE);
1176             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1177             appendOp(URX_BUILD(op, 1));
1178         }
1179         break;
1180
1181     case doBackslashb:
1182         {
1183             #if  UCONFIG_NO_BREAK_ITERATION==1
1184             if (fModeFlags & UREGEX_UWORD) {
1185                 error(U_UNSUPPORTED_ERROR);
1186             }
1187             #endif
1188             fixLiterals(FALSE);
1189             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1190             appendOp(URX_BUILD(op, 0));
1191         }
1192         break;
1193
1194     case doBackslashD:
1195         fixLiterals(FALSE);
1196         appendOp(URX_BUILD(URX_BACKSLASH_D, 1));
1197         break;
1198
1199     case doBackslashd:
1200         fixLiterals(FALSE);
1201         appendOp(URX_BUILD(URX_BACKSLASH_D, 0));
1202         break;
1203
1204     case doBackslashG:
1205         fixLiterals(FALSE);
1206         appendOp(URX_BUILD(URX_BACKSLASH_G, 0));
1207         break;
1208
1209     case doBackslashS:
1210         fixLiterals(FALSE);
1211         appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET));
1212         break;
1213
1214     case doBackslashs:
1215         fixLiterals(FALSE);
1216         appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET));
1217         break;
1218
1219     case doBackslashW:
1220         fixLiterals(FALSE);
1221         appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET));
1222         break;
1223
1224     case doBackslashw:
1225         fixLiterals(FALSE);
1226         appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET));
1227         break;
1228
1229     case doBackslashX:
1230         fixLiterals(FALSE);
1231         appendOp(URX_BUILD(URX_BACKSLASH_X, 0));
1232         break;
1233
1234
1235     case doBackslashZ:
1236         fixLiterals(FALSE);
1237         appendOp(URX_BUILD(URX_DOLLAR, 0));
1238         break;
1239
1240     case doBackslashz:
1241         fixLiterals(FALSE);
1242         appendOp(URX_BUILD(URX_BACKSLASH_Z, 0));
1243         break;
1244
1245     case doEscapeError:
1246         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1247         break;
1248
1249     case doExit:
1250         fixLiterals(FALSE);
1251         returnVal = FALSE;
1252         break;
1253
1254     case doProperty:
1255         {
1256             fixLiterals(FALSE);
1257             UnicodeSet *theSet = scanProp();
1258             compileSet(theSet);
1259         }
1260         break;
1261
1262     case doNamedChar:
1263         {
1264             UChar32 c = scanNamedChar();
1265             literalChar(c);
1266         }
1267         break;
1268
1269
1270     case doBackRef:
1271         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1272         //                 the regular expression, because the number of digits to be consumed
1273         //                 depends on the number of capture groups that have been defined.  So
1274         //                 we have to do it here instead.
1275         {
1276             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1277             int32_t  groupNum = 0;
1278             UChar32  c        = fC.fChar;
1279
1280             for (;;) {
1281                 // Loop once per digit, for max allowed number of digits in a back reference.
1282                 int32_t digit = u_charDigitValue(c);
1283                 groupNum = groupNum * 10 + digit;
1284                 if (groupNum >= numCaptureGroups) {
1285                     break;
1286                 }
1287                 c = peekCharLL();
1288                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1289                     break;
1290                 }
1291                 nextCharLL();
1292             }
1293
1294             // Scan of the back reference in the source regexp is complete.  Now generate
1295             //  the compiled code for it.
1296             // Because capture groups can be forward-referenced by back-references,
1297             //  we fill the operand with the capture group number.  At the end
1298             //  of compilation, it will be changed to the variable's location.
1299             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1300                                      //    and shouldn't enter this code path at all.
1301             fixLiterals(FALSE);
1302             int32_t  op;
1303             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1304                 op = URX_BUILD(URX_BACKREF_I, groupNum);
1305             } else {
1306                 op = URX_BUILD(URX_BACKREF, groupNum);
1307             }
1308             appendOp(op);
1309         }
1310         break;
1311
1312
1313     case doPossessivePlus:
1314         // Possessive ++ quantifier.
1315         // Compiles to
1316         //       1.   STO_SP
1317         //       2.      body of stuff being iterated over
1318         //       3.   STATE_SAVE 5
1319         //       4.   JMP        2
1320         //       5.   LD_SP
1321         //       6.   ...
1322         //
1323         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1324         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1325         //
1326         {
1327             // Emit the STO_SP
1328             int32_t   topLoc = blockTopLoc(TRUE);
1329             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1330             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1331             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1332
1333             // Emit the STATE_SAVE
1334             op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1335             appendOp(op);
1336
1337             // Emit the JMP
1338             op = URX_BUILD(URX_JMP, topLoc+1);
1339             appendOp(op);
1340
1341             // Emit the LD_SP
1342             op = URX_BUILD(URX_LD_SP, stoLoc);
1343             appendOp(op);
1344         }
1345         break;
1346
1347     case doPossessiveStar:
1348         // Possessive *+ quantifier.
1349         // Compiles to
1350         //       1.   STO_SP       loc
1351         //       2.   STATE_SAVE   5
1352         //       3.      body of stuff being iterated over
1353         //       4.   JMP          2
1354         //       5.   LD_SP        loc
1355         //       6    ...
1356         // TODO:  do something to cut back the state stack each time through the loop.
1357         {
1358             // Reserve two slots at the top of the block.
1359             int32_t   topLoc = blockTopLoc(TRUE);
1360             insertOp(topLoc);
1361
1362             // emit   STO_SP     loc
1363             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1364             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1365             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1366
1367             // Emit the SAVE_STATE   5
1368             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1369             op = URX_BUILD(URX_STATE_SAVE, L7);
1370             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1371
1372             // Append the JMP operation.
1373             op = URX_BUILD(URX_JMP, topLoc+1);
1374             appendOp(op);
1375
1376             // Emit the LD_SP       loc
1377             op = URX_BUILD(URX_LD_SP, stoLoc);
1378             appendOp(op);
1379         }
1380         break;
1381
1382     case doPossessiveOpt:
1383         // Possessive  ?+ quantifier.
1384         //  Compiles to
1385         //     1. STO_SP      loc
1386         //     2. SAVE_STATE  5
1387         //     3.    body of optional block
1388         //     4. LD_SP       loc
1389         //     5. ...
1390         //
1391         {
1392             // Reserve two slots at the top of the block.
1393             int32_t   topLoc = blockTopLoc(TRUE);
1394             insertOp(topLoc);
1395
1396             // Emit the STO_SP
1397             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1398             int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1399             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1400
1401             // Emit the SAVE_STATE
1402             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1403             op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1404             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1405
1406             // Emit the LD_SP
1407             op = URX_BUILD(URX_LD_SP, stoLoc);
1408             appendOp(op);
1409         }
1410         break;
1411
1412
1413     case doBeginMatchMode:
1414         fNewModeFlags = fModeFlags;
1415         fSetModeFlag  = TRUE;
1416         break;
1417
1418     case doMatchMode:   //  (?i)    and similar
1419         {
1420             int32_t  bit = 0;
1421             switch (fC.fChar) {
1422             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1423             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1424             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1425             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1426             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1427             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1428             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1429             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1430             default:
1431                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1432                                    // by the scanner.
1433             }
1434             if (fSetModeFlag) {
1435                 fNewModeFlags |= bit;
1436             } else {
1437                 fNewModeFlags &= ~bit;
1438             }
1439         }
1440         break;
1441
1442     case doSetMatchMode:
1443         // Emit code to match any pending literals, using the not-yet changed match mode.
1444         fixLiterals();
1445
1446         // We've got a (?i) or similar.  The match mode is being changed, but
1447         //   the change is not scoped to a parenthesized block.
1448         U_ASSERT(fNewModeFlags < 0);
1449         fModeFlags = fNewModeFlags;
1450
1451         break;
1452
1453
1454     case doMatchModeParen:
1455         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1456         //   mode flags so they can be restored at the close of the block.
1457         //
1458         //   Compile to a
1459         //      - NOP, which later may be replaced by a save-state if the
1460         //         parenthesized group gets a * quantifier, followed by
1461         //      - NOP, which may later be replaced by a save-state if there
1462         //             is an '|' alternation within the parens.
1463         {
1464             fixLiterals(FALSE);
1465             appendOp(URX_BUILD(URX_NOP, 0));
1466             appendOp(URX_BUILD(URX_NOP, 0));
1467
1468             // On the Parentheses stack, start a new frame and add the postions
1469             //   of the two NOPs (a normal non-capturing () frame, except for the
1470             //   saving of the orignal mode flags.)
1471             fParenStack.push(fModeFlags, *fStatus);
1472             fParenStack.push(flags, *fStatus);                            // Frame Marker
1473             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1474             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1475
1476             // Set the current mode flags to the new values.
1477             U_ASSERT(fNewModeFlags < 0);
1478             fModeFlags = fNewModeFlags;
1479         }
1480         break;
1481
1482     case doBadModeFlag:
1483         error(U_REGEX_INVALID_FLAG);
1484         break;
1485
1486     case doSuppressComments:
1487         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1488         // treating a '#' as a to-the-end-of-line comment.
1489         //   (This Perl compatibility just gets uglier and uglier to do...)
1490         fEOLComments = FALSE;
1491         break;
1492
1493
1494     case doSetAddAmp:
1495         {
1496           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1497           set->add(chAmp);
1498         }
1499         break;
1500
1501     case doSetAddDash:
1502         {
1503           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1504           set->add(chDash);
1505         }
1506         break;
1507
1508      case doSetBackslash_s:
1509         {
1510          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1511          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1512          break;
1513         }
1514
1515      case doSetBackslash_S:
1516         {
1517             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1518             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1519             SSet.complement();
1520             set->addAll(SSet);
1521             break;
1522         }
1523
1524     case doSetBackslash_d:
1525         {
1526             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1527             // TODO - make a static set, ticket 6058.
1528             addCategory(set, U_GC_ND_MASK, *fStatus);
1529             break;
1530         }
1531
1532     case doSetBackslash_D:
1533         {
1534             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1535             UnicodeSet digits;
1536             // TODO - make a static set, ticket 6058.
1537             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1538             digits.complement();
1539             set->addAll(digits);
1540             break;
1541         }
1542
1543     case doSetBackslash_w:
1544         {
1545             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1546             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1547             break;
1548         }
1549
1550     case doSetBackslash_W:
1551         {
1552             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1553             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1554             SSet.complement();
1555             set->addAll(SSet);
1556             break;
1557         }
1558
1559     case doSetBegin:
1560         fixLiterals(FALSE);
1561         fSetStack.push(new UnicodeSet(), *fStatus);
1562         fSetOpStack.push(setStart, *fStatus);
1563         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1564             fSetOpStack.push(setCaseClose, *fStatus);
1565         }
1566         break;
1567
1568     case doSetBeginDifference1:
1569         //  We have scanned something like [[abc]-[
1570         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1571         //  Push a Difference operator, which will cause the new set to be subtracted from what
1572         //    went before once it is created.
1573         setPushOp(setDifference1);
1574         fSetOpStack.push(setStart, *fStatus);
1575         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1576             fSetOpStack.push(setCaseClose, *fStatus);
1577         }
1578         break;
1579
1580     case doSetBeginIntersection1:
1581         //  We have scanned something like  [[abc]&[
1582         //   Need both the '&' operator and the open '[' operator.
1583         setPushOp(setIntersection1);
1584         fSetOpStack.push(setStart, *fStatus);
1585         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1586             fSetOpStack.push(setCaseClose, *fStatus);
1587         }
1588         break;
1589
1590     case doSetBeginUnion:
1591         //  We have scanned something like  [[abc][
1592         //     Need to handle the union operation explicitly [[abc] | [
1593         setPushOp(setUnion);
1594         fSetOpStack.push(setStart, *fStatus);
1595         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1596             fSetOpStack.push(setCaseClose, *fStatus);
1597         }
1598         break;
1599
1600     case doSetDifference2:
1601         // We have scanned something like [abc--
1602         //   Consider this to unambiguously be a set difference operator.
1603         setPushOp(setDifference2);
1604         break;
1605
1606     case doSetEnd:
1607         // Have encountered the ']' that closes a set.
1608         //    Force the evaluation of any pending operations within this set,
1609         //    leave the completed set on the top of the set stack.
1610         setEval(setEnd);
1611         U_ASSERT(fSetOpStack.peeki()==setStart);
1612         fSetOpStack.popi();
1613         break;
1614
1615     case doSetFinish:
1616         {
1617         // Finished a complete set expression, including all nested sets.
1618         //   The close bracket has already triggered clearing out pending set operators,
1619         //    the operator stack should be empty and the operand stack should have just
1620         //    one entry, the result set.
1621         U_ASSERT(fSetOpStack.empty());
1622         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1623         U_ASSERT(fSetStack.empty());
1624         compileSet(theSet);
1625         break;
1626         }
1627
1628     case doSetIntersection2:
1629         // Have scanned something like [abc&&
1630         setPushOp(setIntersection2);
1631         break;
1632
1633     case doSetLiteral:
1634         // Union the just-scanned literal character into the set being built.
1635         //    This operation is the highest precedence set operation, so we can always do
1636         //    it immediately, without waiting to see what follows.  It is necessary to perform
1637         //    any pending '-' or '&' operation first, because these have the same precedence
1638         //    as union-ing in a literal'
1639         {
1640             setEval(setUnion);
1641             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1642             s->add(fC.fChar);
1643             fLastSetLiteral = fC.fChar;
1644             break;
1645         }
1646
1647     case doSetLiteralEscaped:
1648         // A back-slash escaped literal character was encountered.
1649         // Processing is the same as with setLiteral, above, with the addition of
1650         //  the optional check for errors on escaped ASCII letters.
1651         {
1652             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1653                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1654                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1655                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1656             }
1657             setEval(setUnion);
1658             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1659             s->add(fC.fChar);
1660             fLastSetLiteral = fC.fChar;
1661             break;
1662         }
1663
1664         case doSetNamedChar:
1665         // Scanning a \N{UNICODE CHARACTER NAME}
1666         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1667         //    above.
1668         {
1669             UChar32  c = scanNamedChar();
1670             setEval(setUnion);
1671             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1672             s->add(c);
1673             fLastSetLiteral = c;
1674             break;
1675         }
1676
1677     case doSetNamedRange:
1678         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1679         // The left character is already in the set, and is saved in fLastSetLiteral.
1680         // The right side needs to be picked up, the scan is at the 'N'.
1681         // Lower Limit > Upper limit being an error matches both Java
1682         //        and ICU UnicodeSet behavior.
1683         {
1684             UChar32  c = scanNamedChar();
1685             if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1686                 error(U_REGEX_INVALID_RANGE);
1687             }
1688             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1689             s->add(fLastSetLiteral, c);
1690             fLastSetLiteral = c;
1691             break;
1692         }
1693
1694
1695     case  doSetNegate:
1696         // Scanned a '^' at the start of a set.
1697         // Push the negation operator onto the set op stack.
1698         // A twist for case-insensitive matching:
1699         //   the case closure operation must happen _before_ negation.
1700         //   But the case closure operation will already be on the stack if it's required.
1701         //   This requires checking for case closure, and swapping the stack order
1702         //    if it is present.
1703         {
1704             int32_t  tosOp = fSetOpStack.peeki();
1705             if (tosOp == setCaseClose) {
1706                 fSetOpStack.popi();
1707                 fSetOpStack.push(setNegation, *fStatus);
1708                 fSetOpStack.push(setCaseClose, *fStatus);
1709             } else {
1710                 fSetOpStack.push(setNegation, *fStatus);
1711             }
1712         }
1713         break;
1714
1715     case doSetNoCloseError:
1716         error(U_REGEX_MISSING_CLOSE_BRACKET);
1717         break;
1718
1719     case doSetOpError:
1720         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1721         break;
1722
1723     case doSetPosixProp:
1724         {
1725             UnicodeSet *s = scanPosixProp();
1726             if (s != NULL) {
1727                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1728                 tos->addAll(*s);
1729                 delete s;
1730             }  // else error.  scanProp() reported the error status already.
1731         }
1732         break;
1733
1734     case doSetProp:
1735         //  Scanned a \p \P within [brackets].
1736         {
1737             UnicodeSet *s = scanProp();
1738             if (s != NULL) {
1739                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1740                 tos->addAll(*s);
1741                 delete s;
1742             }  // else error.  scanProp() reported the error status already.
1743         }
1744         break;
1745
1746
1747     case doSetRange:
1748         // We have scanned literal-literal.  Add the range to the set.
1749         // The left character is already in the set, and is saved in fLastSetLiteral.
1750         // The right side is the current character.
1751         // Lower Limit > Upper limit being an error matches both Java
1752         //        and ICU UnicodeSet behavior.
1753         {
1754         if (fLastSetLiteral > fC.fChar) {
1755             error(U_REGEX_INVALID_RANGE);
1756         }
1757         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758         s->add(fLastSetLiteral, fC.fChar);
1759         break;
1760         }
1761
1762     default:
1763         U_ASSERT(FALSE);
1764         error(U_REGEX_INTERNAL_ERROR);
1765         break;
1766     }
1767
1768     if (U_FAILURE(*fStatus)) {
1769         returnVal = FALSE;
1770     }
1771
1772     return returnVal;
1773 }
1774
1775
1776
1777 //------------------------------------------------------------------------------
1778 //
1779 //   literalChar           We've encountered a literal character from the pattern,
1780 //                             or an escape sequence that reduces to a character.
1781 //                         Add it to the string containing all literal chars/strings from
1782 //                             the pattern.
1783 //
1784 //------------------------------------------------------------------------------
1785 void RegexCompile::literalChar(UChar32 c)  {
1786     fLiteralChars.append(c);
1787 }
1788
1789
1790 //------------------------------------------------------------------------------
1791 //
1792 //    fixLiterals           When compiling something that can follow a literal
1793 //                          string in a pattern, emit the code to match the
1794 //                          accumulated literal string.
1795 //
1796 //                          Optionally, split the last char of the string off into
1797 //                          a single "ONE_CHAR" operation, so that quantifiers can
1798 //                          apply to that char alone.  Example:   abc*
1799 //                          The * must apply to the 'c' only.
1800 //
1801 //------------------------------------------------------------------------------
1802 void    RegexCompile::fixLiterals(UBool split) {
1803     int32_t  op = 0;                       // An op from/for the compiled pattern.
1804
1805     // If no literal characters have been scanned but not yet had code generated
1806     //   for them, nothing needs to be done.
1807     if (fLiteralChars.length() == 0) {
1808         return;
1809     }
1810
1811     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1812     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1813
1814     // Split:  We need to  ensure that the last item in the compiled pattern
1815     //     refers only to the last literal scanned in the pattern, so that
1816     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1817     //     Split before case folding for case insensitive matches.
1818
1819     if (split) {
1820         fLiteralChars.truncate(indexOfLastCodePoint);
1821         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1822                               //  Note that the truncated literal string may be empty, in which case
1823                               //  nothing will be emitted.
1824
1825         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1826         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1827         return;
1828     }
1829
1830     // If we are doing case-insensitive matching, case fold the string.  This may expand
1831     //   the string, e.g. the German sharp-s turns into "ss"
1832     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1833         fLiteralChars.foldCase();
1834         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1835         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1836     }
1837
1838     if (indexOfLastCodePoint == 0) {
1839         // Single character, emit a URX_ONECHAR op to match it.
1840         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1841                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1842             op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
1843         } else {
1844             op = URX_BUILD(URX_ONECHAR, lastCodePoint);
1845         }
1846         appendOp(op);
1847     } else {
1848         // Two or more chars, emit a URX_STRING to match them.
1849         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1850             op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
1851         } else {
1852             // TODO here:  add optimization to split case sensitive strings of length two
1853             //             into two single char ops, for efficiency.
1854             op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
1855         }
1856         appendOp(op);
1857         op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
1858         appendOp(op);
1859
1860         // Add this string into the accumulated strings of the compiled pattern.
1861         // The total size of the accumulated strings must be restricted to 24 bits because
1862         // string indexes appear as compiled pattern operand values.
1863         // This is the only place that the pattern.fLiteralText string is modified.
1864
1865         fRXPat->fLiteralText.append(fLiteralChars);
1866         if (U_SUCCESS(*fStatus) && fRXPat->fLiteralText.length() > 0x00ffffff) {
1867             *fStatus = U_REGEX_PATTERN_TOO_BIG;
1868         }
1869     }
1870
1871     fLiteralChars.remove();
1872 }
1873
1874
1875 //------------------------------------------------------------------------------
1876 //
1877 //   appendOp()             Append a new instruction onto the compiled pattern
1878 //                          Includes error checking, limiting the size of the
1879 //                          pattern to lengths that can be represented in the
1880 //                          24 bit operand field of an instruction.
1881 //
1882 //------------------------------------------------------------------------------
1883 void RegexCompile::appendOp(int32_t op) {
1884     fRXPat->fCompiledPat->addElement(op, *fStatus);
1885     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1886         *fStatus = U_REGEX_PATTERN_TOO_BIG;
1887     }
1888 }
1889
1890
1891 //------------------------------------------------------------------------------
1892 //
1893 //   insertOp()             Insert a slot for a new opcode into the already
1894 //                          compiled pattern code.
1895 //
1896 //                          Fill the slot with a NOP.  Our caller will replace it
1897 //                          with what they really wanted.
1898 //
1899 //------------------------------------------------------------------------------
1900 void   RegexCompile::insertOp(int32_t where) {
1901     UVector64 *code = fRXPat->fCompiledPat;
1902     U_ASSERT(where>0 && where < code->size());
1903
1904     int32_t  nop = URX_BUILD(URX_NOP, 0);
1905     code->insertElementAt(nop, where, *fStatus);
1906
1907     // Walk through the pattern, looking for any ops with targets that
1908     //  were moved down by the insert.  Fix them.
1909     int32_t loc;
1910     for (loc=0; loc<code->size(); loc++) {
1911         int32_t op = (int32_t)code->elementAti(loc);
1912         int32_t opType = URX_TYPE(op);
1913         int32_t opValue = URX_VAL(op);
1914         if ((opType == URX_JMP         ||
1915             opType == URX_JMPX         ||
1916             opType == URX_STATE_SAVE   ||
1917             opType == URX_CTR_LOOP     ||
1918             opType == URX_CTR_LOOP_NG  ||
1919             opType == URX_JMP_SAV      ||
1920             opType == URX_JMP_SAV_X    ||
1921             opType == URX_RELOC_OPRND)    && opValue > where) {
1922             // Target location for this opcode is after the insertion point and
1923             //   needs to be incremented to adjust for the insertion.
1924             opValue++;
1925             op = URX_BUILD(opType, opValue);
1926             code->setElementAt(op, loc);
1927         }
1928     }
1929
1930     // Now fix up the parentheses stack.  All positive values in it are locations in
1931     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
1932     for (loc=0; loc<fParenStack.size(); loc++) {
1933         int32_t x = fParenStack.elementAti(loc);
1934         U_ASSERT(x < code->size());
1935         if (x>where) {
1936             x++;
1937             fParenStack.setElementAt(x, loc);
1938         }
1939     }
1940
1941     if (fMatchCloseParen > where) {
1942         fMatchCloseParen++;
1943     }
1944     if (fMatchOpenParen > where) {
1945         fMatchOpenParen++;
1946     }
1947 }
1948
1949
1950 //------------------------------------------------------------------------------
1951 //
1952 //   allocateData()        Allocate storage in the matcher's static data area.
1953 //                         Return the index for the newly allocated data.
1954 //                         The storage won't actually exist until we are running a match
1955 //                         operation, but the storage indexes are inserted into various
1956 //                         opcodes while compiling the pattern.
1957 //
1958 //------------------------------------------------------------------------------
1959 int32_t RegexCompile::allocateData(int32_t size) {
1960     if (U_FAILURE(*fStatus)) {
1961         return 0;
1962     }
1963     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
1964         *fStatus = U_REGEX_INTERNAL_ERROR;
1965         return 0;
1966     }
1967     int32_t dataIndex = fRXPat->fDataSize;
1968     fRXPat->fDataSize += size;
1969     if (fRXPat->fDataSize >= 0x00fffff0) {
1970         *fStatus = U_REGEX_PATTERN_TOO_BIG;
1971     }
1972     return dataIndex;
1973 }
1974
1975
1976 //------------------------------------------------------------------------------
1977 //
1978 //   allocateStackData()   Allocate space in the back-tracking stack frame.
1979 //                         Return the index for the newly allocated data.
1980 //                         The frame indexes are inserted into various
1981 //                         opcodes while compiling the pattern, meaning that frame
1982 //                         size must be restricted to the size that will fit
1983 //                         as an operand (24 bits).
1984 //
1985 //------------------------------------------------------------------------------
1986 int32_t RegexCompile::allocateStackData(int32_t size) {
1987     if (U_FAILURE(*fStatus)) {
1988         return 0;
1989     }
1990     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
1991         *fStatus = U_REGEX_INTERNAL_ERROR;
1992         return 0;
1993     }
1994     int32_t dataIndex = fRXPat->fFrameSize;
1995     fRXPat->fFrameSize += size;
1996     if (fRXPat->fFrameSize >= 0x00fffff0) {
1997         *fStatus = U_REGEX_PATTERN_TOO_BIG;
1998     }
1999     return dataIndex;
2000 }
2001
2002
2003 //------------------------------------------------------------------------------
2004 //
2005 //   blockTopLoc()          Find or create a location in the compiled pattern
2006 //                          at the start of the operation or block that has
2007 //                          just been compiled.  Needed when a quantifier (* or
2008 //                          whatever) appears, and we need to add an operation
2009 //                          at the start of the thing being quantified.
2010 //
2011 //                          (Parenthesized Blocks) have a slot with a NOP that
2012 //                          is reserved for this purpose.  .* or similar don't
2013 //                          and a slot needs to be added.
2014 //
2015 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2016 //                                         at the returned location.
2017 //                                 FALSE - just return the address,
2018 //                                         do not reserve a location there.
2019 //
2020 //------------------------------------------------------------------------------
2021 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2022     int32_t   theLoc;
2023     fixLiterals(TRUE);  // Emit code for any pending literals.
2024                         //   If last item was a string, emit separate op for the its last char.
2025     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2026     {
2027         // The item just processed is a parenthesized block.
2028         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2029         U_ASSERT(theLoc > 0);
2030         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2031     }
2032     else {
2033         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2034         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2035         // We need to make space now.
2036         theLoc = fRXPat->fCompiledPat->size()-1;
2037         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2038         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2039             // Strings take two opcode, we want the position of the first one.
2040             // We can have a string at this point if a single character case-folded to two.
2041             theLoc--;
2042         }
2043         if (reserveLoc) {
2044             int32_t  nop = URX_BUILD(URX_NOP, 0);
2045             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2046         }
2047     }
2048     return theLoc;
2049 }
2050
2051
2052
2053 //------------------------------------------------------------------------------
2054 //
2055 //    handleCloseParen      When compiling a close paren, we need to go back
2056 //                          and fix up any JMP or SAVE operations within the
2057 //                          parenthesized block that need to target the end
2058 //                          of the block.  The locations of these are kept on
2059 //                          the paretheses stack.
2060 //
2061 //                          This function is called both when encountering a
2062 //                          real ) and at the end of the pattern.
2063 //
2064 //------------------------------------------------------------------------------
2065 void  RegexCompile::handleCloseParen() {
2066     int32_t   patIdx;
2067     int32_t   patOp;
2068     if (fParenStack.size() <= 0) {
2069         error(U_REGEX_MISMATCHED_PAREN);
2070         return;
2071     }
2072
2073     // Emit code for any pending literals.
2074     fixLiterals(FALSE);
2075
2076     // Fixup any operations within the just-closed parenthesized group
2077     //    that need to reference the end of the (block).
2078     //    (The first one popped from the stack is an unused slot for
2079     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2080     for (;;) {
2081         patIdx = fParenStack.popi();
2082         if (patIdx < 0) {
2083             // value < 0 flags the start of the frame on the paren stack.
2084             break;
2085         }
2086         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2087         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2088         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2089         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2090         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2091         fMatchOpenParen     = patIdx;
2092     }
2093
2094     //  At the close of any parenthesized block, restore the match mode flags  to
2095     //  the value they had at the open paren.  Saved value is
2096     //  at the top of the paren stack.
2097     fModeFlags = fParenStack.popi();
2098     U_ASSERT(fModeFlags < 0);
2099
2100     // DO any additional fixups, depending on the specific kind of
2101     // parentesized grouping this is
2102
2103     switch (patIdx) {
2104     case plain:
2105     case flags:
2106         // No additional fixups required.
2107         //   (Grouping-only parentheses)
2108         break;
2109     case capturing:
2110         // Capturing Parentheses.
2111         //   Insert a End Capture op into the pattern.
2112         //   The frame offset of the variables for this cg is obtained from the
2113         //       start capture op and put it into the end-capture op.
2114         {
2115             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2116             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2117
2118             int32_t   frameVarLocation = URX_VAL(captureOp);
2119             int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
2120             appendOp(endCaptureOp);
2121         }
2122         break;
2123     case atomic:
2124         // Atomic Parenthesis.
2125         //   Insert a LD_SP operation to restore the state stack to the position
2126         //   it was when the atomic parens were entered.
2127         {
2128             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2129             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2130             int32_t   stoLoc = URX_VAL(stoOp);
2131             int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
2132             appendOp(ldOp);
2133         }
2134         break;
2135
2136     case lookAhead:
2137         {
2138             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2139             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2140             int32_t dataLoc  = URX_VAL(startOp);
2141             int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
2142             appendOp(op);
2143         }
2144         break;
2145
2146     case negLookAhead:
2147         {
2148             // See comment at doOpenLookAheadNeg
2149             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2150             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2151             int32_t dataLoc  = URX_VAL(startOp);
2152             int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
2153             appendOp(op);
2154             op               = URX_BUILD(URX_BACKTRACK, 0);
2155             appendOp(op);
2156             op               = URX_BUILD(URX_LA_END, dataLoc);
2157             appendOp(op);
2158
2159             // Patch the URX_SAVE near the top of the block.
2160             // The destination of the SAVE is the final LA_END that was just added.
2161             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2162             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2163             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2164             saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
2165             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2166         }
2167         break;
2168
2169     case lookBehind:
2170         {
2171             // See comment at doOpenLookBehind.
2172
2173             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2174             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2175             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2176             int32_t dataLoc  = URX_VAL(startOp);
2177             int32_t op       = URX_BUILD(URX_LB_END, dataLoc);
2178             appendOp(op);
2179             op = URX_BUILD(URX_LA_END, dataLoc);
2180             appendOp(op);
2181
2182             // Determine the min and max bounds for the length of the
2183             //  string that the pattern can match.
2184             //  An unbounded upper limit is an error.
2185             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2186             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2187             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2188             if (URX_TYPE(maxML) != 0) {
2189                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2190                 break;
2191             }
2192             if (maxML == INT32_MAX) {
2193                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2194                 break;
2195             }
2196             U_ASSERT(minML <= maxML);
2197
2198             // Insert the min and max match len bounds into the URX_LB_CONT op that
2199             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2200             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2201             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2202
2203         }
2204         break;
2205
2206
2207
2208     case lookBehindN:
2209         {
2210             // See comment at doOpenLookBehindNeg.
2211
2212             // Append the URX_LBN_END to the compiled pattern.
2213             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2214             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2215             int32_t dataLoc  = URX_VAL(startOp);
2216             int32_t op       = URX_BUILD(URX_LBN_END, dataLoc);
2217             appendOp(op);
2218
2219             // Determine the min and max bounds for the length of the
2220             //  string that the pattern can match.
2221             //  An unbounded upper limit is an error.
2222             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2223             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2224             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2225             if (URX_TYPE(maxML) != 0) {
2226                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2227                 break;
2228             }
2229             if (maxML == INT32_MAX) {
2230                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2231                 break;
2232             }
2233             U_ASSERT(minML <= maxML);
2234
2235             // Insert the min and max match len bounds into the URX_LB_CONT op that
2236             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2237             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2238             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2239
2240             // Insert the pattern location to continue at after a successful match
2241             //  as the last operand of the URX_LBN_CONT
2242             op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2243             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2244         }
2245         break;
2246
2247
2248
2249     default:
2250         U_ASSERT(FALSE);
2251     }
2252
2253     // remember the next location in the compiled pattern.
2254     // The compilation of Quantifiers will look at this to see whether its looping
2255     //   over a parenthesized block or a single item
2256     fMatchCloseParen = fRXPat->fCompiledPat->size();
2257 }
2258
2259
2260
2261 //------------------------------------------------------------------------------
2262 //
2263 //   compileSet       Compile the pattern operations for a reference to a
2264 //                    UnicodeSet.
2265 //
2266 //------------------------------------------------------------------------------
2267 void        RegexCompile::compileSet(UnicodeSet *theSet)
2268 {
2269     if (theSet == NULL) {
2270         return;
2271     }
2272     //  Remove any strings from the set.
2273     //  There shoudn't be any, but just in case.
2274     //     (Case Closure can add them; if we had a simple case closure avaialble that
2275     //      ignored strings, that would be better.)
2276     theSet->removeAllStrings();
2277     int32_t  setSize = theSet->size();
2278
2279     switch (setSize) {
2280     case 0:
2281         {
2282             // Set of no elements.   Always fails to match.
2283             appendOp(URX_BUILD(URX_BACKTRACK, 0));
2284             delete theSet;
2285         }
2286         break;
2287
2288     case 1:
2289         {
2290             // The set contains only a single code point.  Put it into
2291             //   the compiled pattern as a single char operation rather
2292             //   than a set, and discard the set itself.
2293             literalChar(theSet->charAt(0));
2294             delete theSet;
2295         }
2296         break;
2297
2298     default:
2299         {
2300             //  The set contains two or more chars.  (the normal case)
2301             //  Put it into the compiled pattern as a set.
2302             int32_t setNumber = fRXPat->fSets->size();
2303             fRXPat->fSets->addElement(theSet, *fStatus);
2304             int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
2305             appendOp(setOp);
2306         }
2307     }
2308 }
2309
2310
2311 //------------------------------------------------------------------------------
2312 //
2313 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2314 //                      Except for the specific opcodes used, the code is the same
2315 //                      for all three types (greedy, non-greedy, possessive) of
2316 //                      intervals.  The opcodes are supplied as parameters.
2317 //                      (There are two sets of opcodes - greedy & possessive use the
2318 //                      same ones, while non-greedy has it's own.)
2319 //
2320 //                      The code for interval loops has this form:
2321 //                         0  CTR_INIT   counter loc (in stack frame)
2322 //                         1             5  patt address of CTR_LOOP at bottom of block
2323 //                         2             min count
2324 //                         3             max count   (-1 for unbounded)
2325 //                         4  ...        block to be iterated over
2326 //                         5  CTR_LOOP
2327 //
2328 //                       In
2329 //------------------------------------------------------------------------------
2330 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2331 {
2332     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2333     //   four slots in the compiled code.  Reserve them.
2334     int32_t   topOfBlock = blockTopLoc(TRUE);
2335     insertOp(topOfBlock);
2336     insertOp(topOfBlock);
2337     insertOp(topOfBlock);
2338
2339     // The operands for the CTR_INIT opcode include the index in the matcher data
2340     //   of the counter.  Allocate it now. There are two data items
2341     //        counterLoc   -->  Loop counter
2342     //               +1    -->  Input index (for breaking non-progressing loops)
2343     //                          (Only present if unbounded upper limit on loop)
2344     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2345     int32_t   counterLoc = allocateStackData(dataSize);
2346
2347     int32_t   op = URX_BUILD(InitOp, counterLoc);
2348     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2349
2350     // The second operand of CTR_INIT is the location following the end of the loop.
2351     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2352     //   compilation of something later on causes the code to grow and the target
2353     //   position to move.
2354     int32_t loopEnd = fRXPat->fCompiledPat->size();
2355     op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
2356     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2357
2358     // Followed by the min and max counts.
2359     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2360     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2361
2362     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2363     //   Goes at end of the block being looped over, so just append to the code so far.
2364     op = URX_BUILD(LoopOp, topOfBlock);
2365     appendOp(op);
2366
2367     if ((fIntervalLow & 0xff000000) != 0 ||
2368         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2369             error(U_REGEX_NUMBER_TOO_BIG);
2370         }
2371
2372     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2373         error(U_REGEX_MAX_LT_MIN);
2374     }
2375 }
2376
2377
2378
2379 UBool RegexCompile::compileInlineInterval() {
2380     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2381         // Too big to inline.  Fail, which will cause looping code to be generated.
2382         //   (Upper < Lower picks up unbounded upper and errors, both.)
2383         return FALSE;
2384     }
2385
2386     int32_t   topOfBlock = blockTopLoc(FALSE);
2387     if (fIntervalUpper == 0) {
2388         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2389         // Discard the generated code for the block.
2390         // If the block included parens, discard the info pertaining to them as well.
2391         fRXPat->fCompiledPat->setSize(topOfBlock);
2392         if (fMatchOpenParen >= topOfBlock) {
2393             fMatchOpenParen = -1;
2394         }
2395         if (fMatchCloseParen >= topOfBlock) {
2396             fMatchCloseParen = -1;
2397         }
2398         return TRUE;
2399     }
2400
2401     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2402         // The thing being repeated is not a single op, but some
2403         //   more complex block.  Do it as a loop, not inlines.
2404         //   Note that things "repeated" a max of once are handled as inline, because
2405         //     the one copy of the code already generated is just fine.
2406         return FALSE;
2407     }
2408
2409     // Pick up the opcode that is to be repeated
2410     //
2411     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2412
2413     // Compute the pattern location where the inline sequence
2414     //   will end, and set up the state save op that will be needed.
2415     //
2416     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2417                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2418     int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
2419     if (fIntervalLow == 0) {
2420         insertOp(topOfBlock);
2421         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2422     }
2423
2424
2425
2426     //  Loop, emitting the op for the thing being repeated each time.
2427     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2428     //    it was put there when it was originally encountered.
2429     int32_t i;
2430     for (i=1; i<fIntervalUpper; i++ ) {
2431         if (i == fIntervalLow) {
2432             appendOp(saveOp);
2433         }
2434         if (i > fIntervalLow) {
2435             appendOp(saveOp);
2436         }
2437         appendOp(op);
2438     }
2439     return TRUE;
2440 }
2441
2442
2443
2444 //------------------------------------------------------------------------------
2445 //
2446 //   matchStartType    Determine how a match can start.
2447 //                     Used to optimize find() operations.
2448 //
2449 //                     Operation is very similar to minMatchLength().  Walk the compiled
2450 //                     pattern, keeping an on-going minimum-match-length.  For any
2451 //                     op where the min match coming in is zero, add that ops possible
2452 //                     starting matches to the possible starts for the overall pattern.
2453 //
2454 //------------------------------------------------------------------------------
2455 void   RegexCompile::matchStartType() {
2456     if (U_FAILURE(*fStatus)) {
2457         return;
2458     }
2459
2460
2461     int32_t    loc;                    // Location in the pattern of the current op being processed.
2462     int32_t    op;                     // The op being processed
2463     int32_t    opType;                 // The opcode type of the op
2464     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2465     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2466
2467     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2468                                        //   could have advanced the position in a match.
2469                                        //   (Maximum match length so far == 0)
2470
2471     // forwardedLength is a vector holding minimum-match-length values that
2472     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2473     //   It must be one longer than the pattern being checked because some  ops
2474     //   will jmp to a end-of-block+1 location from within a block, and we must
2475     //   count those when checking the block.
2476     int32_t end = fRXPat->fCompiledPat->size();
2477     UVector32  forwardedLength(end+1, *fStatus);
2478     forwardedLength.setSize(end+1);
2479     for (loc=3; loc<end; loc++) {
2480         forwardedLength.setElementAt(INT32_MAX, loc);
2481     }
2482
2483     for (loc = 3; loc<end; loc++) {
2484         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2485         opType = URX_TYPE(op);
2486
2487         // The loop is advancing linearly through the pattern.
2488         // If the op we are now at was the destination of a branch in the pattern,
2489         // and that path has a shorter minimum length than the current accumulated value,
2490         // replace the current accumulated value.
2491         if (forwardedLength.elementAti(loc) < currentLen) {
2492             currentLen = forwardedLength.elementAti(loc);
2493             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2494         }
2495
2496         switch (opType) {
2497             // Ops that don't change the total length matched
2498         case URX_RESERVED_OP:
2499         case URX_END:
2500         case URX_FAIL:
2501         case URX_STRING_LEN:
2502         case URX_NOP:
2503         case URX_START_CAPTURE:
2504         case URX_END_CAPTURE:
2505         case URX_BACKSLASH_B:
2506         case URX_BACKSLASH_BU:
2507         case URX_BACKSLASH_G:
2508         case URX_BACKSLASH_Z:
2509         case URX_DOLLAR:
2510         case URX_DOLLAR_M:
2511         case URX_DOLLAR_D:
2512         case URX_DOLLAR_MD:
2513         case URX_RELOC_OPRND:
2514         case URX_STO_INP_LOC:
2515         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2516         case URX_BACKREF_I:
2517
2518         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2519         case URX_LD_SP:
2520             break;
2521
2522         case URX_CARET:
2523             if (atStart) {
2524                 fRXPat->fStartType = START_START;
2525             }
2526             break;
2527
2528         case URX_CARET_M:
2529         case URX_CARET_M_UNIX:
2530             if (atStart) {
2531                 fRXPat->fStartType = START_LINE;
2532             }
2533             break;
2534
2535         case URX_ONECHAR:
2536             if (currentLen == 0) {
2537                 // This character could appear at the start of a match.
2538                 //   Add it to the set of possible starting characters.
2539                 fRXPat->fInitialChars->add(URX_VAL(op));
2540                 numInitialStrings += 2;
2541             }
2542             currentLen++;
2543             atStart = FALSE;
2544             break;
2545
2546
2547         case URX_SETREF:
2548             if (currentLen == 0) {
2549                 int32_t  sn = URX_VAL(op);
2550                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2551                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2552                 fRXPat->fInitialChars->addAll(*s);
2553                 numInitialStrings += 2;
2554             }
2555             currentLen++;
2556             atStart = FALSE;
2557             break;
2558
2559         case URX_LOOP_SR_I:
2560             // [Set]*, like a SETREF, above, in what it can match,
2561             //  but may not match at all, so currentLen is not incremented.
2562             if (currentLen == 0) {
2563                 int32_t  sn = URX_VAL(op);
2564                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2565                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2566                 fRXPat->fInitialChars->addAll(*s);
2567                 numInitialStrings += 2;
2568             }
2569             atStart = FALSE;
2570             break;
2571
2572         case URX_LOOP_DOT_I:
2573             if (currentLen == 0) {
2574                 // .* at the start of a pattern.
2575                 //    Any character can begin the match.
2576                 fRXPat->fInitialChars->clear();
2577                 fRXPat->fInitialChars->complement();
2578                 numInitialStrings += 2;
2579             }
2580             atStart = FALSE;
2581             break;
2582
2583
2584         case URX_STATIC_SETREF:
2585             if (currentLen == 0) {
2586                 int32_t  sn = URX_VAL(op);
2587                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2588                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2589                 fRXPat->fInitialChars->addAll(*s);
2590                 numInitialStrings += 2;
2591             }
2592             currentLen++;
2593             atStart = FALSE;
2594             break;
2595
2596
2597
2598         case URX_STAT_SETREF_N:
2599             if (currentLen == 0) {
2600                 int32_t  sn = URX_VAL(op);
2601                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2602                 UnicodeSet sc(*s);
2603                 sc.complement();
2604                 fRXPat->fInitialChars->addAll(sc);
2605                 numInitialStrings += 2;
2606             }
2607             currentLen++;
2608             atStart = FALSE;
2609             break;
2610
2611
2612
2613         case URX_BACKSLASH_D:
2614             // Digit Char
2615              if (currentLen == 0) {
2616                  UnicodeSet s;
2617                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2618                  if (URX_VAL(op) != 0) {
2619                      s.complement();
2620                  }
2621                  fRXPat->fInitialChars->addAll(s);
2622                  numInitialStrings += 2;
2623             }
2624             currentLen++;
2625             atStart = FALSE;
2626             break;
2627
2628
2629         case URX_ONECHAR_I:
2630             // Case Insensitive Single Character.
2631             if (currentLen == 0) {
2632                 UChar32  c = URX_VAL(op);
2633                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2634
2635                     // Disable optimizations on first char of match.
2636                     // TODO: Compute the set of chars that case fold to this char, or to
2637                     //       a string that begins with this char.
2638                     //       For simple case folding, this code worked:
2639                     //   UnicodeSet s(c, c);
2640                     //   s.closeOver(USET_CASE_INSENSITIVE);
2641                     //   fRXPat->fInitialChars->addAll(s);
2642
2643                     fRXPat->fInitialChars->clear();
2644                     fRXPat->fInitialChars->complement();
2645                 } else {
2646                     // Char has no case variants.  Just add it as-is to the
2647                     //   set of possible starting chars.
2648                     fRXPat->fInitialChars->add(c);
2649                 }
2650                 numInitialStrings += 2;
2651             }
2652             currentLen++;
2653             atStart = FALSE;
2654             break;
2655
2656
2657         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2658         case URX_DOTANY_ALL:    // . matches one or two.
2659         case URX_DOTANY:
2660         case URX_DOTANY_UNIX:
2661             if (currentLen == 0) {
2662                 // These constructs are all bad news when they appear at the start
2663                 //   of a match.  Any character can begin the match.
2664                 fRXPat->fInitialChars->clear();
2665                 fRXPat->fInitialChars->complement();
2666                 numInitialStrings += 2;
2667             }
2668             currentLen++;
2669             atStart = FALSE;
2670             break;
2671
2672
2673         case URX_JMPX:
2674             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2675         case URX_JMP:
2676             {
2677                 int32_t  jmpDest = URX_VAL(op);
2678                 if (jmpDest < loc) {
2679                     // Loop of some kind.  Can safely ignore, the worst that will happen
2680                     //  is that we understate the true minimum length
2681                     currentLen = forwardedLength.elementAti(loc+1);
2682
2683                 } else {
2684                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2685                     U_ASSERT(jmpDest <= end+1);
2686                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2687                         forwardedLength.setElementAt(currentLen, jmpDest);
2688                     }
2689                 }
2690             }
2691             atStart = FALSE;
2692             break;
2693
2694         case URX_JMP_SAV:
2695         case URX_JMP_SAV_X:
2696             // Combo of state save to the next loc, + jmp backwards.
2697             //   Net effect on min. length computation is nothing.
2698             atStart = FALSE;
2699             break;
2700
2701         case URX_BACKTRACK:
2702             // Fails are kind of like a branch, except that the min length was
2703             //   propagated already, by the state save.
2704             currentLen = forwardedLength.elementAti(loc+1);
2705             atStart = FALSE;
2706             break;
2707
2708
2709         case URX_STATE_SAVE:
2710             {
2711                 // State Save, for forward jumps, propagate the current minimum.
2712                 //             of the state save.
2713                 int32_t  jmpDest = URX_VAL(op);
2714                 if (jmpDest > loc) {
2715                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2716                         forwardedLength.setElementAt(currentLen, jmpDest);
2717                     }
2718                 }
2719             }
2720             atStart = FALSE;
2721             break;
2722
2723
2724
2725
2726         case URX_STRING:
2727             {
2728                 loc++;
2729                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2730                 int32_t stringLen   = URX_VAL(stringLenOp);
2731                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2732                 U_ASSERT(stringLenOp >= 2);
2733                 if (currentLen == 0) {
2734                     // Add the starting character of this string to the set of possible starting
2735                     //   characters for this pattern.
2736                     int32_t stringStartIdx = URX_VAL(op);
2737                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2738                     fRXPat->fInitialChars->add(c);
2739
2740                     // Remember this string.  After the entire pattern has been checked,
2741                     //  if nothing else is identified that can start a match, we'll use it.
2742                     numInitialStrings++;
2743                     fRXPat->fInitialStringIdx = stringStartIdx;
2744                     fRXPat->fInitialStringLen = stringLen;
2745                 }
2746
2747                 currentLen += stringLen;
2748                 atStart = FALSE;
2749             }
2750             break;
2751
2752         case URX_STRING_I:
2753             {
2754                 // Case-insensitive string.  Unlike exact-match strings, we won't
2755                 //   attempt a string search for possible match positions.  But we
2756                 //   do update the set of possible starting characters.
2757                 loc++;
2758                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2759                 int32_t stringLen   = URX_VAL(stringLenOp);
2760                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2761                 U_ASSERT(stringLenOp >= 2);
2762                 if (currentLen == 0) {
2763                     // Add the starting character of this string to the set of possible starting
2764                     //   characters for this pattern.
2765                     int32_t stringStartIdx = URX_VAL(op);
2766                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2767                     UnicodeSet s(c, c);
2768
2769                     // TODO:  compute correct set of starting chars for full case folding.
2770                     //        For the moment, say any char can start.
2771                     // s.closeOver(USET_CASE_INSENSITIVE);
2772                     s.clear();
2773                     s.complement();
2774
2775                     fRXPat->fInitialChars->addAll(s);
2776                     numInitialStrings += 2;  // Matching on an initial string not possible.
2777                 }
2778                 currentLen += stringLen;
2779                 atStart = FALSE;
2780             }
2781             break;
2782
2783         case URX_CTR_INIT:
2784         case URX_CTR_INIT_NG:
2785             {
2786                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
2787                 //   so location must be updated accordingly.
2788                 // Loop Init Ops.
2789                 //   If the min loop count == 0
2790                 //      move loc forwards to the end of the loop, skipping over the body.
2791                 //   If the min count is > 0,
2792                 //      continue normal processing of the body of the loop.
2793                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
2794                         loopEndLoc   = URX_VAL(loopEndLoc);
2795                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
2796                 if (minLoopCount == 0) {
2797                     // Min Loop Count of 0, treat like a forward branch and
2798                     //   move the current minimum length up to the target
2799                     //   (end of loop) location.
2800                     U_ASSERT(loopEndLoc <= end+1);
2801                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
2802                         forwardedLength.setElementAt(currentLen, loopEndLoc);
2803                     }
2804                 }
2805                 loc+=3;  // Skips over operands of CTR_INIT
2806             }
2807             atStart = FALSE;
2808             break;
2809
2810
2811         case URX_CTR_LOOP:
2812         case URX_CTR_LOOP_NG:
2813             // Loop ops.
2814             //  The jump is conditional, backwards only.
2815             atStart = FALSE;
2816             break;
2817
2818         case URX_LOOP_C:
2819             // More loop ops.  These state-save to themselves.
2820             //   don't change the minimum match
2821             atStart = FALSE;
2822             break;
2823
2824
2825         case URX_LA_START:
2826         case URX_LB_START:
2827             {
2828                 // Look-around.  Scan forward until the matching look-ahead end,
2829                 //   without processing the look-around block.  This is overly pessimistic.
2830
2831                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
2832                 //   lookahead contains two LA_END instructions, so count goes up by two
2833                 //   for each LA_START.
2834                 int32_t  depth = (opType == URX_LA_START? 2: 1);
2835                 for (;;) {
2836                     loc++;
2837                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2838                     if (URX_TYPE(op) == URX_LA_START) {
2839                         depth+=2;
2840                     }
2841                     if (URX_TYPE(op) == URX_LB_START) {
2842                         depth++;
2843                     }
2844                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2845                         depth--;
2846                         if (depth == 0) {
2847                             break;
2848                         }
2849                     }
2850                     if (URX_TYPE(op) == URX_STATE_SAVE) {
2851                         // Need this because neg lookahead blocks will FAIL to outside
2852                         //   of the block.
2853                         int32_t  jmpDest = URX_VAL(op);
2854                         if (jmpDest > loc) {
2855                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
2856                                 forwardedLength.setElementAt(currentLen, jmpDest);
2857                             }
2858                         }
2859                     }
2860                     U_ASSERT(loc <= end);
2861                 }
2862             }
2863             break;
2864
2865         case URX_LA_END:
2866         case URX_LB_CONT:
2867         case URX_LB_END:
2868         case URX_LBN_CONT:
2869         case URX_LBN_END:
2870             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
2871                                  //  consumed by the scan in URX_LA_START and LB_START
2872
2873             break;
2874
2875         default:
2876             U_ASSERT(FALSE);
2877             }
2878
2879         }
2880
2881
2882     // We have finished walking through the ops.  Check whether some forward jump
2883     //   propagated a shorter length to location end+1.
2884     if (forwardedLength.elementAti(end+1) < currentLen) {
2885         currentLen = forwardedLength.elementAti(end+1);
2886     }
2887
2888
2889     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
2890
2891
2892     // Sort out what we should check for when looking for candidate match start positions.
2893     // In order of preference,
2894     //     1.   Start of input text buffer.
2895     //     2.   A literal string.
2896     //     3.   Start of line in multi-line mode.
2897     //     4.   A single literal character.
2898     //     5.   A character from a set of characters.
2899     //
2900     if (fRXPat->fStartType == START_START) {
2901         // Match only at the start of an input text string.
2902         //    start type is already set.  We're done.
2903     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
2904         // Match beginning only with a literal string.
2905         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
2906         U_ASSERT(fRXPat->fInitialChars->contains(c));
2907         fRXPat->fStartType   = START_STRING;
2908         fRXPat->fInitialChar = c;
2909     } else if (fRXPat->fStartType == START_LINE) {
2910         // Match at start of line in Multi-Line mode.
2911         // Nothing to do here; everything is already set.
2912     } else if (fRXPat->fMinMatchLen == 0) {
2913         // Zero length match possible.  We could start anywhere.
2914         fRXPat->fStartType = START_NO_INFO;
2915     } else if (fRXPat->fInitialChars->size() == 1) {
2916         // All matches begin with the same char.
2917         fRXPat->fStartType   = START_CHAR;
2918         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
2919         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
2920     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
2921         fRXPat->fMinMatchLen > 0) {
2922         // Matches start with a set of character smaller than the set of all chars.
2923         fRXPat->fStartType = START_SET;
2924     } else {
2925         // Matches can start with anything
2926         fRXPat->fStartType = START_NO_INFO;
2927     }
2928
2929     return;
2930 }
2931
2932
2933
2934 //------------------------------------------------------------------------------
2935 //
2936 //   minMatchLength    Calculate the length of the shortest string that could
2937 //                     match the specified pattern.
2938 //                     Length is in 16 bit code units, not code points.
2939 //
2940 //                     The calculated length may not be exact.  The returned
2941 //                     value may be shorter than the actual minimum; it must
2942 //                     never be longer.
2943 //
2944 //                     start and end are the range of p-code operations to be
2945 //                     examined.  The endpoints are included in the range.
2946 //
2947 //------------------------------------------------------------------------------
2948 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
2949     if (U_FAILURE(*fStatus)) {
2950         return 0;
2951     }
2952
2953     U_ASSERT(start <= end);
2954     U_ASSERT(end < fRXPat->fCompiledPat->size());
2955
2956
2957     int32_t    loc;
2958     int32_t    op;
2959     int32_t    opType;
2960     int32_t    currentLen = 0;
2961
2962
2963     // forwardedLength is a vector holding minimum-match-length values that
2964     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2965     //   It must be one longer than the pattern being checked because some  ops
2966     //   will jmp to a end-of-block+1 location from within a block, and we must
2967     //   count those when checking the block.
2968     UVector32  forwardedLength(end+2, *fStatus);
2969     forwardedLength.setSize(end+2);
2970     for (loc=start; loc<=end+1; loc++) {
2971         forwardedLength.setElementAt(INT32_MAX, loc);
2972     }
2973
2974     for (loc = start; loc<=end; loc++) {
2975         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2976         opType = URX_TYPE(op);
2977
2978         // The loop is advancing linearly through the pattern.
2979         // If the op we are now at was the destination of a branch in the pattern,
2980         // and that path has a shorter minimum length than the current accumulated value,
2981         // replace the current accumulated value.
2982         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
2983                                                                //   no-match-possible cases.
2984         if (forwardedLength.elementAti(loc) < currentLen) {
2985             currentLen = forwardedLength.elementAti(loc);
2986             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2987         }
2988
2989         switch (opType) {
2990             // Ops that don't change the total length matched
2991         case URX_RESERVED_OP:
2992         case URX_END:
2993         case URX_STRING_LEN:
2994         case URX_NOP:
2995         case URX_START_CAPTURE:
2996         case URX_END_CAPTURE:
2997         case URX_BACKSLASH_B:
2998         case URX_BACKSLASH_BU:
2999         case URX_BACKSLASH_G:
3000         case URX_BACKSLASH_Z:
3001         case URX_CARET:
3002         case URX_DOLLAR:
3003         case URX_DOLLAR_M:
3004         case URX_DOLLAR_D:
3005         case URX_DOLLAR_MD:
3006         case URX_RELOC_OPRND:
3007         case URX_STO_INP_LOC:
3008         case URX_CARET_M:
3009         case URX_CARET_M_UNIX:
3010         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3011         case URX_BACKREF_I:
3012
3013         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3014         case URX_LD_SP:
3015
3016         case URX_JMP_SAV:
3017         case URX_JMP_SAV_X:
3018             break;
3019
3020
3021             // Ops that match a minimum of one character (one or two 16 bit code units.)
3022             //
3023         case URX_ONECHAR:
3024         case URX_STATIC_SETREF:
3025         case URX_STAT_SETREF_N:
3026         case URX_SETREF:
3027         case URX_BACKSLASH_D:
3028         case URX_ONECHAR_I:
3029         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3030         case URX_DOTANY_ALL:    // . matches one or two.
3031         case URX_DOTANY:
3032         case URX_DOTANY_UNIX:
3033             currentLen++;
3034             break;
3035
3036
3037         case URX_JMPX:
3038             loc++;              // URX_JMPX has an extra operand, ignored here,
3039                                 //   otherwise processed identically to URX_JMP.
3040         case URX_JMP:
3041             {
3042                 int32_t  jmpDest = URX_VAL(op);
3043                 if (jmpDest < loc) {
3044                     // Loop of some kind.  Can safely ignore, the worst that will happen
3045                     //  is that we understate the true minimum length
3046                     currentLen = forwardedLength.elementAti(loc+1);
3047                 } else {
3048                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3049                     U_ASSERT(jmpDest <= end+1);
3050                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3051                         forwardedLength.setElementAt(currentLen, jmpDest);
3052                     }
3053                 }
3054             }
3055             break;
3056
3057         case URX_BACKTRACK:
3058             {
3059                 // Back-tracks are kind of like a branch, except that the min length was
3060                 //   propagated already, by the state save.
3061                 currentLen = forwardedLength.elementAti(loc+1);
3062             }
3063             break;
3064
3065
3066         case URX_STATE_SAVE:
3067             {
3068                 // State Save, for forward jumps, propagate the current minimum.
3069                 //             of the state save.
3070                 int32_t  jmpDest = URX_VAL(op);
3071                 if (jmpDest > loc) {
3072                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3073                         forwardedLength.setElementAt(currentLen, jmpDest);
3074                     }
3075                 }
3076             }
3077             break;
3078
3079
3080         case URX_STRING:
3081             {
3082                 loc++;
3083                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3084                 currentLen += URX_VAL(stringLenOp);
3085             }
3086             break;
3087
3088
3089         case URX_STRING_I:
3090             {
3091                 loc++;
3092                 // TODO: with full case folding, matching input text may be shorter than
3093                 //       the string we have here.  More smarts could put some bounds on it.
3094                 //       Assume a min length of one for now.  A min length of zero causes
3095                 //        optimization failures for a pattern like "string"+
3096                 // currentLen += URX_VAL(stringLenOp);
3097                 currentLen += 1;
3098             }
3099             break;
3100
3101         case URX_CTR_INIT:
3102         case URX_CTR_INIT_NG:
3103             {
3104                 // Loop Init Ops.
3105                 //   If the min loop count == 0
3106                 //      move loc forwards to the end of the loop, skipping over the body.
3107                 //   If the min count is > 0,
3108                 //      continue normal processing of the body of the loop.
3109                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3110                         loopEndLoc   = URX_VAL(loopEndLoc);
3111                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3112                 if (minLoopCount == 0) {
3113                     loc = loopEndLoc;
3114                 } else {
3115                     loc+=3;  // Skips over operands of CTR_INIT
3116                 }
3117             }
3118             break;
3119
3120
3121         case URX_CTR_LOOP:
3122         case URX_CTR_LOOP_NG:
3123             // Loop ops.
3124             //  The jump is conditional, backwards only.
3125             break;
3126
3127         case URX_LOOP_SR_I:
3128         case URX_LOOP_DOT_I:
3129         case URX_LOOP_C:
3130             // More loop ops.  These state-save to themselves.
3131             //   don't change the minimum match - could match nothing at all.
3132             break;
3133
3134
3135         case URX_LA_START:
3136         case URX_LB_START:
3137             {
3138                 // Look-around.  Scan forward until the matching look-ahead end,
3139                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3140                 //   it assumes that the look-ahead match might be zero-length.
3141                 //   TODO:  Positive lookahead could recursively do the block, then continue
3142                 //          with the longer of the block or the value coming in.  Ticket 6060
3143                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3144                 for (;;) {
3145                     loc++;
3146                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3147                     if (URX_TYPE(op) == URX_LA_START) {
3148                         // The boilerplate for look-ahead includes two LA_END insturctions,
3149                         //    Depth will be decremented by each one when it is seen.
3150                         depth += 2;
3151                     }
3152                     if (URX_TYPE(op) == URX_LB_START) {
3153                         depth++;
3154                     }
3155                     if (URX_TYPE(op) == URX_LA_END) {
3156                         depth--;
3157                         if (depth == 0) {
3158                             break;
3159                         }
3160                     }
3161                     if (URX_TYPE(op)==URX_LBN_END) {
3162                         depth--;
3163                         if (depth == 0) {
3164                             break;
3165                         }
3166                     }
3167                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3168                         // Need this because neg lookahead blocks will FAIL to outside
3169                         //   of the block.
3170                         int32_t  jmpDest = URX_VAL(op);
3171                         if (jmpDest > loc) {
3172                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3173                                 forwardedLength.setElementAt(currentLen, jmpDest);
3174                             }
3175                         }
3176                     }
3177                     U_ASSERT(loc <= end);
3178                 }
3179             }
3180             break;
3181
3182         case URX_LA_END:
3183         case URX_LB_CONT:
3184         case URX_LB_END:
3185         case URX_LBN_CONT:
3186         case URX_LBN_END:
3187             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3188             //   range being sized, which happens when measuring size of look-behind blocks.
3189             break;
3190
3191         default:
3192             U_ASSERT(FALSE);
3193             }
3194
3195         }
3196
3197     // We have finished walking through the ops.  Check whether some forward jump
3198     //   propagated a shorter length to location end+1.
3199     if (forwardedLength.elementAti(end+1) < currentLen) {
3200         currentLen = forwardedLength.elementAti(end+1);
3201         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3202     }
3203
3204     return currentLen;
3205 }
3206
3207 // Increment with overflow check.
3208 // val and delta will both be positive.
3209
3210 static int32_t safeIncrement(int32_t val, int32_t delta) {
3211     if (INT32_MAX - val > delta) {
3212         return val + delta;
3213     } else {
3214         return INT32_MAX;
3215     }
3216 }
3217
3218
3219 //------------------------------------------------------------------------------
3220 //
3221 //   maxMatchLength    Calculate the length of the longest string that could
3222 //                     match the specified pattern.
3223 //                     Length is in 16 bit code units, not code points.
3224 //
3225 //                     The calculated length may not be exact.  The returned
3226 //                     value may be longer than the actual maximum; it must
3227 //                     never be shorter.
3228 //
3229 //------------------------------------------------------------------------------
3230 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3231     if (U_FAILURE(*fStatus)) {
3232         return 0;
3233     }
3234     U_ASSERT(start <= end);
3235     U_ASSERT(end < fRXPat->fCompiledPat->size());
3236
3237
3238     int32_t    loc;
3239     int32_t    op;
3240     int32_t    opType;
3241     int32_t    currentLen = 0;
3242     UVector32  forwardedLength(end+1, *fStatus);
3243     forwardedLength.setSize(end+1);
3244
3245     for (loc=start; loc<=end; loc++) {
3246         forwardedLength.setElementAt(0, loc);
3247     }
3248
3249     for (loc = start; loc<=end; loc++) {
3250         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3251         opType = URX_TYPE(op);
3252
3253         // The loop is advancing linearly through the pattern.
3254         // If the op we are now at was the destination of a branch in the pattern,
3255         // and that path has a longer maximum length than the current accumulated value,
3256         // replace the current accumulated value.
3257         if (forwardedLength.elementAti(loc) > currentLen) {
3258             currentLen = forwardedLength.elementAti(loc);
3259         }
3260
3261         switch (opType) {
3262             // Ops that don't change the total length matched
3263         case URX_RESERVED_OP:
3264         case URX_END:
3265         case URX_STRING_LEN:
3266         case URX_NOP:
3267         case URX_START_CAPTURE:
3268         case URX_END_CAPTURE:
3269         case URX_BACKSLASH_B:
3270         case URX_BACKSLASH_BU:
3271         case URX_BACKSLASH_G:
3272         case URX_BACKSLASH_Z:
3273         case URX_CARET:
3274         case URX_DOLLAR:
3275         case URX_DOLLAR_M:
3276         case URX_DOLLAR_D:
3277         case URX_DOLLAR_MD:
3278         case URX_RELOC_OPRND:
3279         case URX_STO_INP_LOC:
3280         case URX_CARET_M:
3281         case URX_CARET_M_UNIX:
3282
3283         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3284         case URX_LD_SP:
3285
3286         case URX_LB_END:
3287         case URX_LB_CONT:
3288         case URX_LBN_CONT:
3289         case URX_LBN_END:
3290             break;
3291
3292
3293             // Ops that increase that cause an unbounded increase in the length
3294             //   of a matched string, or that increase it a hard to characterize way.
3295             //   Call the max length unbounded, and stop further checking.
3296         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3297         case URX_BACKREF_I:
3298         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3299             currentLen = INT32_MAX;
3300             break;
3301
3302
3303             // Ops that match a max of one character (possibly two 16 bit code units.)
3304             //
3305         case URX_STATIC_SETREF:
3306         case URX_STAT_SETREF_N:
3307         case URX_SETREF:
3308         case URX_BACKSLASH_D:
3309         case URX_ONECHAR_I:
3310         case URX_DOTANY_ALL:
3311         case URX_DOTANY:
3312         case URX_DOTANY_UNIX:
3313             currentLen = safeIncrement(currentLen, 2);
3314             break;
3315
3316             // Single literal character.  Increase current max length by one or two,
3317             //       depending on whether the char is in the supplementary range.
3318         case URX_ONECHAR:
3319             currentLen = safeIncrement(currentLen, 1);
3320             if (URX_VAL(op) > 0x10000) {
3321                 currentLen = safeIncrement(currentLen, 1);
3322             }
3323             break;
3324
3325             // Jumps.
3326             //
3327         case URX_JMP:
3328         case URX_JMPX:
3329         case URX_JMP_SAV:
3330         case URX_JMP_SAV_X:
3331             {
3332                 int32_t  jmpDest = URX_VAL(op);
3333                 if (jmpDest < loc) {
3334                     // Loop of some kind.  Max match length is unbounded.
3335                     currentLen = INT32_MAX;
3336                 } else {
3337                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3338                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3339                         forwardedLength.setElementAt(currentLen, jmpDest);
3340                     }
3341                     currentLen = 0;
3342                 }
3343             }
3344             break;
3345
3346         case URX_BACKTRACK:
3347             // back-tracks are kind of like a branch, except that the max length was
3348             //   propagated already, by the state save.
3349             currentLen = forwardedLength.elementAti(loc+1);
3350             break;
3351
3352
3353         case URX_STATE_SAVE:
3354             {
3355                 // State Save, for forward jumps, propagate the current minimum.
3356                 //               of the state save.
3357                 //             For backwards jumps, they create a loop, maximum
3358                 //               match length is unbounded.
3359                 int32_t  jmpDest = URX_VAL(op);
3360                 if (jmpDest > loc) {
3361                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3362                         forwardedLength.setElementAt(currentLen, jmpDest);
3363                     }
3364                 } else {
3365                     currentLen = INT32_MAX;
3366                 }
3367             }
3368             break;
3369
3370
3371
3372
3373         case URX_STRING:
3374             {
3375                 loc++;
3376                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3377                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3378                 break;
3379             }
3380
3381         case URX_STRING_I:
3382             // TODO:  This code assumes that any user string that matches will be no longer
3383             //        than our compiled string, with case insensitive matching.
3384             //        Our compiled string has been case-folded already.
3385             //
3386             //        Any matching user string will have no more code points than our
3387             //        compiled (folded) string.  Folding may add code points, but
3388             //        not remove them.
3389             //
3390             //        There is a potential problem if a supplemental code point
3391             //        case-folds to a BMP code point.  In this case our compiled string
3392             //        could be shorter (in code units) than a matching user string.
3393             //
3394             //        At this time (Unicode 6.1) there are no such characters, and this case
3395             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3396             //        any problematic characters are added to Unicode.
3397             //
3398             //        If this happens, we can make a set of the BMP chars that the
3399             //        troublesome supplementals fold to, scan our string, and bump the
3400             //        currentLen one extra for each that is found.
3401             //
3402             {
3403                 loc++;
3404                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3405                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3406             }
3407             break;
3408
3409         case URX_CTR_INIT:
3410         case URX_CTR_INIT_NG:
3411             // For Loops, recursively call this function on the pattern for the loop body,
3412             //   then multiply the result by the maximum loop count.
3413             {
3414                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3415                 if (loopEndLoc == loc+4) {
3416                     // Loop has an empty body. No affect on max match length.
3417                     // Continue processing with code after the loop end.
3418                     loc = loopEndLoc;
3419                     break;
3420                 }
3421
3422                 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
3423                 if (maxLoopCount == -1) {
3424                     // Unbounded Loop. No upper bound on match length.
3425                     currentLen = INT32_MAX;
3426                     break;
3427                 }
3428
3429                 U_ASSERT(loopEndLoc >= loc+4);
3430                 int32_t  blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3431                 if (blockLen == INT32_MAX) {
3432                     currentLen = blockLen;
3433                     break;
3434                 }
3435                 currentLen += blockLen * maxLoopCount;
3436                 loc = loopEndLoc;
3437                 break;
3438             }
3439
3440         case URX_CTR_LOOP:
3441         case URX_CTR_LOOP_NG:
3442             // These opcodes will be skipped over by code for URX_CRT_INIT.
3443             // We shouldn't encounter them here.
3444             U_ASSERT(FALSE);
3445             break;
3446
3447         case URX_LOOP_SR_I:
3448         case URX_LOOP_DOT_I:
3449         case URX_LOOP_C:
3450             // For anything to do with loops, make the match length unbounded.
3451             currentLen = INT32_MAX;
3452             break;
3453
3454
3455
3456         case URX_LA_START:
3457         case URX_LA_END:
3458             // Look-ahead.  Just ignore, treat the look-ahead block as if
3459             // it were normal pattern.  Gives a too-long match length,
3460             //  but good enough for now.
3461             break;
3462
3463             // End of look-ahead ops should always be consumed by the processing at
3464             //  the URX_LA_START op.
3465             // U_ASSERT(FALSE);
3466             // break;
3467
3468         case URX_LB_START:
3469             {
3470                 // Look-behind.  Scan forward until the matching look-around end,
3471                 //   without processing the look-behind block.
3472                 int32_t  depth = 0;
3473                 for (;;) {
3474                     loc++;
3475                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3476                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3477                         depth++;
3478                     }
3479                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3480                         if (depth == 0) {
3481                             break;
3482                         }
3483                         depth--;
3484                     }
3485                     U_ASSERT(loc < end);
3486                 }
3487             }
3488             break;
3489
3490         default:
3491             U_ASSERT(FALSE);
3492         }
3493
3494
3495         if (currentLen == INT32_MAX) {
3496             //  The maximum length is unbounded.
3497             //  Stop further processing of the pattern.
3498             break;
3499         }
3500
3501     }
3502     return currentLen;
3503
3504 }
3505
3506
3507 //------------------------------------------------------------------------------
3508 //
3509 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3510 //                Extra NOPs are inserted for some constructs during the initial
3511 //                code generation to provide locations that may be patched later.
3512 //                Many end up unneeded, and are removed by this function.
3513 //
3514 //                In order to minimize the number of passes through the pattern,
3515 //                back-reference fixup is also performed here (adjusting
3516 //                back-reference operands to point to the correct frame offsets).
3517 //
3518 //------------------------------------------------------------------------------
3519 void RegexCompile::stripNOPs() {
3520
3521     if (U_FAILURE(*fStatus)) {
3522         return;
3523     }
3524
3525     int32_t    end = fRXPat->fCompiledPat->size();
3526     UVector32  deltas(end, *fStatus);
3527
3528     // Make a first pass over the code, computing the amount that things
3529     //   will be offset at each location in the original code.
3530     int32_t   loc;
3531     int32_t   d = 0;
3532     for (loc=0; loc<end; loc++) {
3533         deltas.addElement(d, *fStatus);
3534         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3535         if (URX_TYPE(op) == URX_NOP) {
3536             d++;
3537         }
3538     }
3539
3540     UnicodeString caseStringBuffer;
3541
3542     // Make a second pass over the code, removing the NOPs by moving following
3543     //  code up, and patching operands that refer to code locations that
3544     //  are being moved.  The array of offsets from the first step is used
3545     //  to compute the new operand values.
3546     int32_t src;
3547     int32_t dst = 0;
3548     for (src=0; src<end; src++) {
3549         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3550         int32_t opType = URX_TYPE(op);
3551         switch (opType) {
3552         case URX_NOP:
3553             break;
3554
3555         case URX_STATE_SAVE:
3556         case URX_JMP:
3557         case URX_CTR_LOOP:
3558         case URX_CTR_LOOP_NG:
3559         case URX_RELOC_OPRND:
3560         case URX_JMPX:
3561         case URX_JMP_SAV:
3562         case URX_JMP_SAV_X:
3563             // These are instructions with operands that refer to code locations.
3564             {
3565                 int32_t  operandAddress = URX_VAL(op);
3566                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3567                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3568                 op = URX_BUILD(opType, fixedOperandAddress);
3569                 fRXPat->fCompiledPat->setElementAt(op, dst);
3570                 dst++;
3571                 break;
3572             }
3573
3574         case URX_BACKREF:
3575         case URX_BACKREF_I:
3576             {
3577                 int32_t where = URX_VAL(op);
3578                 if (where > fRXPat->fGroupMap->size()) {
3579                     error(U_REGEX_INVALID_BACK_REF);
3580                     break;
3581                 }
3582                 where = fRXPat->fGroupMap->elementAti(where-1);
3583                 op    = URX_BUILD(opType, where);
3584                 fRXPat->fCompiledPat->setElementAt(op, dst);
3585                 dst++;
3586
3587                 fRXPat->fNeedsAltInput = TRUE;
3588                 break;
3589             }
3590         case URX_RESERVED_OP:
3591         case URX_RESERVED_OP_N:
3592         case URX_BACKTRACK:
3593         case URX_END:
3594         case URX_ONECHAR:
3595         case URX_STRING:
3596         case URX_STRING_LEN:
3597         case URX_START_CAPTURE:
3598         case URX_END_CAPTURE:
3599         case URX_STATIC_SETREF:
3600         case URX_STAT_SETREF_N:
3601         case URX_SETREF:
3602         case URX_DOTANY:
3603         case URX_FAIL:
3604         case URX_BACKSLASH_B:
3605         case URX_BACKSLASH_BU:
3606         case URX_BACKSLASH_G:
3607         case URX_BACKSLASH_X:
3608         case URX_BACKSLASH_Z:
3609         case URX_DOTANY_ALL:
3610         case URX_BACKSLASH_D:
3611         case URX_CARET:
3612         case URX_DOLLAR:
3613         case URX_CTR_INIT:
3614         case URX_CTR_INIT_NG:
3615         case URX_DOTANY_UNIX:
3616         case URX_STO_SP:
3617         case URX_LD_SP:
3618         case URX_STO_INP_LOC:
3619         case URX_LA_START:
3620         case URX_LA_END:
3621         case URX_ONECHAR_I:
3622         case URX_STRING_I:
3623         case URX_DOLLAR_M:
3624         case URX_CARET_M:
3625         case URX_CARET_M_UNIX:
3626         case URX_LB_START:
3627         case URX_LB_CONT:
3628         case URX_LB_END:
3629         case URX_LBN_CONT:
3630         case URX_LBN_END:
3631         case URX_LOOP_SR_I:
3632         case URX_LOOP_DOT_I:
3633         case URX_LOOP_C:
3634         case URX_DOLLAR_D:
3635         case URX_DOLLAR_MD:
3636             // These instructions are unaltered by the relocation.
3637             fRXPat->fCompiledPat->setElementAt(op, dst);
3638             dst++;
3639             break;
3640
3641         default:
3642             // Some op is unaccounted for.
3643             U_ASSERT(FALSE);
3644             error(U_REGEX_INTERNAL_ERROR);
3645         }
3646     }
3647
3648     fRXPat->fCompiledPat->setSize(dst);
3649 }
3650
3651
3652
3653
3654 //------------------------------------------------------------------------------
3655 //
3656 //  Error         Report a rule parse error.
3657 //                Only report it if no previous error has been recorded.
3658 //
3659 //------------------------------------------------------------------------------
3660 void RegexCompile::error(UErrorCode e) {
3661     if (U_SUCCESS(*fStatus)) {
3662         *fStatus = e;
3663         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3664         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3665         // int64_t. If the values of the latter are out of range for the former,
3666         // set them to the appropriate "field not supported" values.
3667         if (fLineNum > 0x7FFFFFFF) {
3668             fParseErr->line   = 0;
3669             fParseErr->offset = -1;
3670         } else if (fCharNum > 0x7FFFFFFF) {
3671             fParseErr->line   = (int32_t)fLineNum;
3672             fParseErr->offset = -1;
3673         } else {
3674             fParseErr->line   = (int32_t)fLineNum;
3675             fParseErr->offset = (int32_t)fCharNum;
3676         }
3677
3678         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3679
3680         // Fill in the context.
3681         //   Note: extractBetween() pins supplied indicies to the string bounds.
3682         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3683         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3684         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3685         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3686     }
3687 }
3688
3689
3690 //
3691 //  Assorted Unicode character constants.
3692 //     Numeric because there is no portable way to enter them as literals.
3693 //     (Think EBCDIC).
3694 //
3695 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3696 static const UChar      chLF        = 0x0a;      // Line Feed
3697 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3698 static const UChar      chDigit0    = 0x30;      // '0'
3699 static const UChar      chDigit7    = 0x37;      // '9'
3700 static const UChar      chColon     = 0x3A;      // ':'
3701 static const UChar      chE         = 0x45;      // 'E'
3702 static const UChar      chQ         = 0x51;      // 'Q'
3703 //static const UChar      chN         = 0x4E;      // 'N'
3704 static const UChar      chP         = 0x50;      // 'P'
3705 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3706 //static const UChar      chLBracket  = 0x5b;      // '['
3707 static const UChar      chRBracket  = 0x5d;      // ']'
3708 static const UChar      chUp        = 0x5e;      // '^'
3709 static const UChar      chLowerP    = 0x70;
3710 static const UChar      chLBrace    = 0x7b;      // '{'
3711 static const UChar      chRBrace    = 0x7d;      // '}'
3712 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3713 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3714
3715
3716 //------------------------------------------------------------------------------
3717 //
3718 //  nextCharLL    Low Level Next Char from the regex pattern.
3719 //                Get a char from the string, keep track of input position
3720 //                     for error reporting.
3721 //
3722 //------------------------------------------------------------------------------
3723 UChar32  RegexCompile::nextCharLL() {
3724     UChar32       ch;
3725
3726     if (fPeekChar != -1) {
3727         ch = fPeekChar;
3728         fPeekChar = -1;
3729         return ch;
3730     }
3731
3732     // assume we're already in the right place
3733     ch = UTEXT_NEXT32(fRXPat->fPattern);
3734     if (ch == U_SENTINEL) {
3735         return ch;
3736     }
3737
3738     if (ch == chCR ||
3739         ch == chNEL ||
3740         ch == chLS   ||
3741         (ch == chLF && fLastChar != chCR)) {
3742         // Character is starting a new line.  Bump up the line number, and
3743         //  reset the column to 0.
3744         fLineNum++;
3745         fCharNum=0;
3746     }
3747     else {
3748         // Character is not starting a new line.  Except in the case of a
3749         //   LF following a CR, increment the column position.
3750         if (ch != chLF) {
3751             fCharNum++;
3752         }
3753     }
3754     fLastChar = ch;
3755     return ch;
3756 }
3757
3758 //------------------------------------------------------------------------------
3759 //
3760 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3761 //                 character without actually getting it.
3762 //
3763 //------------------------------------------------------------------------------
3764 UChar32  RegexCompile::peekCharLL() {
3765     if (fPeekChar == -1) {
3766         fPeekChar = nextCharLL();
3767     }
3768     return fPeekChar;
3769 }
3770
3771
3772 //------------------------------------------------------------------------------
3773 //
3774 //   nextChar     for pattern scanning.  At this level, we handle stripping
3775 //                out comments and processing some backslash character escapes.
3776 //                The rest of the pattern grammar is handled at the next level up.
3777 //
3778 //------------------------------------------------------------------------------
3779 void RegexCompile::nextChar(RegexPatternChar &c) {
3780
3781     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3782     c.fChar    = nextCharLL();
3783     c.fQuoted  = FALSE;
3784
3785     if (fQuoteMode) {
3786         c.fQuoted = TRUE;
3787         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
3788             c.fChar == (UChar32)-1) {
3789             fQuoteMode = FALSE;  //  Exit quote mode,
3790             nextCharLL();        // discard the E
3791             nextChar(c);         // recurse to get the real next char
3792         }
3793     }
3794     else if (fInBackslashQuote) {
3795         // The current character immediately follows a '\'
3796         // Don't check for any further escapes, just return it as-is.
3797         // Don't set c.fQuoted, because that would prevent the state machine from
3798         //    dispatching on the character.
3799         fInBackslashQuote = FALSE;
3800     }
3801     else
3802     {
3803         // We are not in a \Q quoted region \E of the source.
3804         //
3805         if (fModeFlags & UREGEX_COMMENTS) {
3806             //
3807             // We are in free-spacing and comments mode.
3808             //  Scan through any white space and comments, until we
3809             //  reach a significant character or the end of inut.
3810             for (;;) {
3811                 if (c.fChar == (UChar32)-1) {
3812                     break;     // End of Input
3813                 }
3814                 if  (c.fChar == chPound && fEOLComments == TRUE) {
3815                     // Start of a comment.  Consume the rest of it, until EOF or a new line
3816                     for (;;) {
3817                         c.fChar = nextCharLL();
3818                         if (c.fChar == (UChar32)-1 ||  // EOF
3819                             c.fChar == chCR        ||
3820                             c.fChar == chLF        ||
3821                             c.fChar == chNEL       ||
3822                             c.fChar == chLS)       {
3823                             break;
3824                         }
3825                     }
3826                 }
3827                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
3828                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
3829                     break;
3830                 }
3831                 c.fChar = nextCharLL();
3832             }
3833         }
3834
3835         //
3836         //  check for backslash escaped characters.
3837         //
3838         if (c.fChar == chBackSlash) {
3839             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3840             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
3841                 //
3842                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
3843                 //   Includes \uxxxx, \n, \r, many others.
3844                 //   Return the single equivalent character.
3845                 //
3846                 nextCharLL();                 // get & discard the peeked char.
3847                 c.fQuoted = TRUE;
3848
3849                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
3850                     int32_t endIndex = (int32_t)pos;
3851                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
3852
3853                     if (endIndex == pos) {
3854                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3855                     }
3856                     fCharNum += endIndex - pos;
3857                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
3858                 } else {
3859                     int32_t offset = 0;
3860                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
3861
3862                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
3863                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
3864
3865                     if (offset == 0) {
3866                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3867                     } else if (context.lastOffset == offset) {
3868                         UTEXT_PREVIOUS32(fRXPat->fPattern);
3869                     } else if (context.lastOffset != offset-1) {
3870                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
3871                     }
3872                     fCharNum += offset;
3873                 }
3874             }
3875             else if (peekCharLL() == chDigit0) {
3876                 //  Octal Escape, using Java Regexp Conventions
3877                 //    which are \0 followed by 1-3 octal digits.
3878                 //    Different from ICU Unescape handling of Octal, which does not
3879                 //    require the leading 0.
3880                 //  Java also has the convention of only consuming 2 octal digits if
3881                 //    the three digit number would be > 0xff
3882                 //
3883                 c.fChar = 0;
3884                 nextCharLL();    // Consume the initial 0.
3885                 int index;
3886                 for (index=0; index<3; index++) {
3887                     int32_t ch = peekCharLL();
3888                     if (ch<chDigit0 || ch>chDigit7) {
3889                         if (index==0) {
3890                            // \0 is not followed by any octal digits.
3891                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3892                         }
3893                         break;
3894                     }
3895                     c.fChar <<= 3;
3896                     c.fChar += ch&7;
3897                     if (c.fChar <= 255) {
3898                         nextCharLL();
3899                     } else {
3900                         // The last digit made the number too big.  Forget we saw it.
3901                         c.fChar >>= 3;
3902                     }
3903                 }
3904                 c.fQuoted = TRUE;
3905             }
3906             else if (peekCharLL() == chQ) {
3907                 //  "\Q"  enter quote mode, which will continue until "\E"
3908                 fQuoteMode = TRUE;
3909                 nextCharLL();       // discard the 'Q'.
3910                 nextChar(c);        // recurse to get the real next char.
3911             }
3912             else
3913             {
3914                 // We are in a '\' escape that will be handled by the state table scanner.
3915                 // Just return the backslash, but remember that the following char is to
3916                 //  be taken literally.
3917                 fInBackslashQuote = TRUE;
3918             }
3919         }
3920     }
3921
3922     // re-enable # to end-of-line comments, in case they were disabled.
3923     // They are disabled by the parser upon seeing '(?', but this lasts for
3924     //  the fetching of the next character only.
3925     fEOLComments = TRUE;
3926
3927     // putc(c.fChar, stdout);
3928 }
3929
3930
3931
3932 //------------------------------------------------------------------------------
3933 //
3934 //  scanNamedChar
3935  //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
3936 //
3937 //             The scan position will be at the 'N'.  On return
3938 //             the scan position should be just after the '}'
3939 //
3940 //             Return the UChar32
3941 //
3942 //------------------------------------------------------------------------------
3943 UChar32  RegexCompile::scanNamedChar() {
3944     if (U_FAILURE(*fStatus)) {
3945         return 0;
3946     }
3947
3948     nextChar(fC);
3949     if (fC.fChar != chLBrace) {
3950         error(U_REGEX_PROPERTY_SYNTAX);
3951         return 0;
3952     }
3953
3954     UnicodeString  charName;
3955     for (;;) {
3956         nextChar(fC);
3957         if (fC.fChar == chRBrace) {
3958             break;
3959         }
3960         if (fC.fChar == -1) {
3961             error(U_REGEX_PROPERTY_SYNTAX);
3962             return 0;
3963         }
3964         charName.append(fC.fChar);
3965     }
3966
3967     char name[100];
3968     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
3969          (uint32_t)charName.length()>=sizeof(name)) {
3970         // All Unicode character names have only invariant characters.
3971         // The API to get a character, given a name, accepts only char *, forcing us to convert,
3972         //   which requires this error check
3973         error(U_REGEX_PROPERTY_SYNTAX);
3974         return 0;
3975     }
3976     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
3977
3978     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
3979     if (U_FAILURE(*fStatus)) {
3980         error(U_REGEX_PROPERTY_SYNTAX);
3981     }
3982
3983     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
3984     return theChar;
3985 }
3986
3987 //------------------------------------------------------------------------------
3988 //
3989 //  scanProp   Construct a UnicodeSet from the text at the current scan
3990 //             position, which will be of the form \p{whaterver}
3991 //
3992 //             The scan position will be at the 'p' or 'P'.  On return
3993 //             the scan position should be just after the '}'
3994 //
3995 //             Return a UnicodeSet, constructed from the \P pattern,
3996 //             or NULL if the pattern is invalid.
3997 //
3998 //------------------------------------------------------------------------------
3999 UnicodeSet *RegexCompile::scanProp() {
4000     UnicodeSet    *uset = NULL;
4001
4002     if (U_FAILURE(*fStatus)) {
4003         return NULL;
4004     }
4005     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4006     UBool negated = (fC.fChar == chP);
4007
4008     UnicodeString propertyName;
4009     nextChar(fC);
4010     if (fC.fChar != chLBrace) {
4011         error(U_REGEX_PROPERTY_SYNTAX);
4012         return NULL;
4013     }
4014     for (;;) {
4015         nextChar(fC);
4016         if (fC.fChar == chRBrace) {
4017             break;
4018         }
4019         if (fC.fChar == -1) {
4020             // Hit the end of the input string without finding the closing '}'
4021             error(U_REGEX_PROPERTY_SYNTAX);
4022             return NULL;
4023         }
4024         propertyName.append(fC.fChar);
4025     }
4026     uset = createSetForProperty(propertyName, negated);
4027     nextChar(fC);    // Move input scan to position following the closing '}'
4028     return uset;
4029 }
4030
4031 //------------------------------------------------------------------------------
4032 //
4033 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4034 //             position, which is expected be of the form [:property expression:]
4035 //
4036 //             The scan position will be at the opening ':'.  On return
4037 //             the scan position must be on the closing ']'
4038 //
4039 //             Return a UnicodeSet constructed from the pattern,
4040 //             or NULL if this is not a valid POSIX-style set expression.
4041 //             If not a property expression, restore the initial scan position
4042 //                (to the opening ':')
4043 //
4044 //               Note:  the opening '[:' is not sufficient to guarantee that
4045 //                      this is a [:property:] expression.
4046 //                      [:'+=,] is a perfectly good ordinary set expression that
4047 //                              happens to include ':' as one of its characters.
4048 //
4049 //------------------------------------------------------------------------------
4050 UnicodeSet *RegexCompile::scanPosixProp() {
4051     UnicodeSet    *uset = NULL;
4052
4053     if (U_FAILURE(*fStatus)) {
4054         return NULL;
4055     }
4056
4057     U_ASSERT(fC.fChar == chColon);
4058
4059     // Save the scanner state.
4060     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4061     int64_t     savedScanIndex        = fScanIndex;
4062     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4063     UBool       savedQuoteMode        = fQuoteMode;
4064     UBool       savedInBackslashQuote = fInBackslashQuote;
4065     UBool       savedEOLComments      = fEOLComments;
4066     int64_t     savedLineNum          = fLineNum;
4067     int64_t     savedCharNum          = fCharNum;
4068     UChar32     savedLastChar         = fLastChar;
4069     UChar32     savedPeekChar         = fPeekChar;
4070     RegexPatternChar savedfC          = fC;
4071
4072     // Scan for a closing ].   A little tricky because there are some perverse
4073     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4074     //   ending on the second closing ].
4075
4076     UnicodeString propName;
4077     UBool         negated  = FALSE;
4078
4079     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4080     nextChar(fC);
4081     if (fC.fChar == chUp) {
4082        negated = TRUE;
4083        nextChar(fC);
4084     }
4085
4086     // Scan for the closing ":]", collecting the property name along the way.
4087     UBool  sawPropSetTerminator = FALSE;
4088     for (;;) {
4089         propName.append(fC.fChar);
4090         nextChar(fC);
4091         if (fC.fQuoted || fC.fChar == -1) {
4092             // Escaped characters or end of input - either says this isn't a [:Property:]
4093             break;
4094         }
4095         if (fC.fChar == chColon) {
4096             nextChar(fC);
4097             if (fC.fChar == chRBracket) {
4098                 sawPropSetTerminator = TRUE;
4099             }
4100             break;
4101         }
4102     }
4103
4104     if (sawPropSetTerminator) {
4105         uset = createSetForProperty(propName, negated);
4106     }
4107     else
4108     {
4109         // No closing ":]".
4110         //  Restore the original scan position.
4111         //  The main scanner will retry the input as a normal set expression,
4112         //    not a [:Property:] expression.
4113         fScanIndex        = savedScanIndex;
4114         fQuoteMode        = savedQuoteMode;
4115         fInBackslashQuote = savedInBackslashQuote;
4116         fEOLComments      = savedEOLComments;
4117         fLineNum          = savedLineNum;
4118         fCharNum          = savedCharNum;
4119         fLastChar         = savedLastChar;
4120         fPeekChar         = savedPeekChar;
4121         fC                = savedfC;
4122         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4123     }
4124     return uset;
4125 }
4126
4127 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4128     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4129     addCategory(set, U_GC_CF_MASK, ec);
4130 }
4131
4132 //
4133 //  Create a Unicode Set from a Unicode Property expression.
4134 //     This is common code underlying both \p{...} ane [:...:] expressions.
4135 //     Includes trying the Java "properties" that aren't supported as
4136 //     normal ICU UnicodeSet properties
4137 //
4138 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4139 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4140 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4141     UnicodeString   setExpr;
4142     UnicodeSet      *set;
4143     uint32_t        usetFlags = 0;
4144
4145     if (U_FAILURE(*fStatus)) {
4146         return NULL;
4147     }
4148
4149     //
4150     //  First try the property as we received it
4151     //
4152     if (negated) {
4153         setExpr.append(negSetPrefix, -1);
4154     } else {
4155         setExpr.append(posSetPrefix, -1);
4156     }
4157     setExpr.append(propName);
4158     setExpr.append(chRBrace);
4159     setExpr.append(chRBracket);
4160     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4161         usetFlags |= USET_CASE_INSENSITIVE;
4162     }
4163     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4164     if (U_SUCCESS(*fStatus)) {
4165        return set;
4166     }
4167     delete set;
4168     set = NULL;
4169
4170     //
4171     //  The property as it was didn't work.
4172
4173     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4174     //     or standard Java, but many other regular expression packages do recognize it.
4175
4176     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4177         *fStatus = U_ZERO_ERROR;
4178         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4179         if (set == NULL) {
4180             *fStatus = U_MEMORY_ALLOCATION_ERROR;
4181             return set;
4182         }
4183         if (negated) {
4184             set->complement();
4185         }
4186         return set;
4187     }
4188
4189
4190     //    Do Java fixes -
4191     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4192     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4193     //
4194     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4195     //                        is accepted by Java.  The property part of the name is compared
4196     //                        case-insenstively.  The spaces must be exactly as shown, either
4197     //                        all there, or all omitted, with exactly one at each position
4198     //                        if they are present.  From checking against JDK 1.6
4199     //
4200     //       This code should be removed when ICU properties support the Java  compatibility names
4201     //          (ICU 4.0?)
4202     //
4203     UnicodeString mPropName = propName;
4204     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4205         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4206     }
4207     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4208         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4209         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4210     }
4211     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4212         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4213     }
4214
4215     //    See if the property looks like a Java "InBlockName", which
4216     //    we will recast as "Block=BlockName"
4217     //
4218     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4219     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4220     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4221         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4222         setExpr.append(BLOCK, -1);
4223         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4224         setExpr.append(chRBrace);
4225         setExpr.append(chRBracket);
4226         *fStatus = U_ZERO_ERROR;
4227         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4228         if (U_SUCCESS(*fStatus)) {
4229             return set;
4230         }
4231         delete set;
4232         set = NULL;
4233     }
4234
4235     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4236         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4237     {
4238         UErrorCode localStatus = U_ZERO_ERROR;
4239         //setExpr.remove();
4240         set = new UnicodeSet();
4241         //
4242         //  Try the various Java specific properties.
4243         //   These all begin with "java"
4244         //
4245         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4246             addCategory(set, U_GC_CN_MASK, localStatus);
4247             set->complement();
4248         }
4249         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4250             addCategory(set, U_GC_ND_MASK, localStatus);
4251         }
4252         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4253             addIdentifierIgnorable(set, localStatus);
4254         }
4255         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4256             set->add(0, 0x1F).add(0x7F, 0x9F);
4257         }
4258         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4259             addCategory(set, U_GC_L_MASK, localStatus);
4260             addCategory(set, U_GC_SC_MASK, localStatus);
4261             addCategory(set, U_GC_PC_MASK, localStatus);
4262             addCategory(set, U_GC_ND_MASK, localStatus);
4263             addCategory(set, U_GC_NL_MASK, localStatus);
4264             addCategory(set, U_GC_MC_MASK, localStatus);
4265             addCategory(set, U_GC_MN_MASK, localStatus);
4266             addIdentifierIgnorable(set, localStatus);
4267         }
4268         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4269             addCategory(set, U_GC_L_MASK, localStatus);
4270             addCategory(set, U_GC_NL_MASK, localStatus);
4271             addCategory(set, U_GC_SC_MASK, localStatus);
4272             addCategory(set, U_GC_PC_MASK, localStatus);
4273         }
4274         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4275             addCategory(set, U_GC_L_MASK, localStatus);
4276         }
4277         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4278             addCategory(set, U_GC_L_MASK, localStatus);
4279             addCategory(set, U_GC_ND_MASK, localStatus);
4280         }
4281         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4282             addCategory(set, U_GC_LL_MASK, localStatus);
4283         }
4284         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4285             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4286         }
4287         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4288             addCategory(set, U_GC_Z_MASK, localStatus);
4289         }
4290         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4291             set->add(0x10000, UnicodeSet::MAX_VALUE);
4292         }
4293         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4294             addCategory(set, U_GC_LT_MASK, localStatus);
4295         }
4296         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4297             addCategory(set, U_GC_L_MASK, localStatus);
4298             addCategory(set, U_GC_NL_MASK, localStatus);
4299         }
4300         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4301             addCategory(set, U_GC_L_MASK, localStatus);
4302             addCategory(set, U_GC_PC_MASK, localStatus);
4303             addCategory(set, U_GC_ND_MASK, localStatus);
4304             addCategory(set, U_GC_NL_MASK, localStatus);
4305             addCategory(set, U_GC_MC_MASK, localStatus);
4306             addCategory(set, U_GC_MN_MASK, localStatus);
4307             addIdentifierIgnorable(set, localStatus);
4308         }
4309         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4310             addCategory(set, U_GC_LU_MASK, localStatus);
4311         }
4312         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4313             set->add(0, UnicodeSet::MAX_VALUE);
4314         }
4315         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4316             addCategory(set, U_GC_Z_MASK, localStatus);
4317             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4318             set->add(9, 0x0d).add(0x1c, 0x1f);
4319         }
4320         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4321             set->add(0, UnicodeSet::MAX_VALUE);
4322         }
4323
4324         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4325             *fStatus = U_ZERO_ERROR;
4326             if (usetFlags & USET_CASE_INSENSITIVE) {
4327                 set->closeOver(USET_CASE_INSENSITIVE);
4328             }
4329             if (negated) {
4330                 set->complement();
4331             }
4332             return set;
4333         }
4334         delete set;
4335         set = NULL;
4336     }
4337     error(*fStatus);
4338     return NULL;
4339 }
4340
4341
4342
4343 //
4344 //  SetEval   Part of the evaluation of [set expressions].
4345 //            Perform any pending (stacked) operations with precedence
4346 //            equal or greater to that of the next operator encountered
4347 //            in the expression.
4348 //
4349 void RegexCompile::setEval(int32_t nextOp) {
4350     UnicodeSet *rightOperand = NULL;
4351     UnicodeSet *leftOperand  = NULL;
4352     for (;;) {
4353         U_ASSERT(fSetOpStack.empty()==FALSE);
4354         int32_t pendingSetOperation = fSetOpStack.peeki();
4355         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4356             break;
4357         }
4358         fSetOpStack.popi();
4359         U_ASSERT(fSetStack.empty() == FALSE);
4360         rightOperand = (UnicodeSet *)fSetStack.peek();
4361         switch (pendingSetOperation) {
4362             case setNegation:
4363                 rightOperand->complement();
4364                 break;
4365             case setCaseClose:
4366                 // TODO: need a simple close function.  Ticket 6065
4367                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4368                 rightOperand->removeAllStrings();
4369                 break;
4370             case setDifference1:
4371             case setDifference2:
4372                 fSetStack.pop();
4373                 leftOperand = (UnicodeSet *)fSetStack.peek();
4374                 leftOperand->removeAll(*rightOperand);
4375                 delete rightOperand;
4376                 break;
4377             case setIntersection1:
4378             case setIntersection2:
4379                 fSetStack.pop();
4380                 leftOperand = (UnicodeSet *)fSetStack.peek();
4381                 leftOperand->retainAll(*rightOperand);
4382                 delete rightOperand;
4383                 break;
4384             case setUnion:
4385                 fSetStack.pop();
4386                 leftOperand = (UnicodeSet *)fSetStack.peek();
4387                 leftOperand->addAll(*rightOperand);
4388                 delete rightOperand;
4389                 break;
4390             default:
4391                 U_ASSERT(FALSE);
4392                 break;
4393             }
4394         }
4395     }
4396
4397 void RegexCompile::setPushOp(int32_t op) {
4398     setEval(op);
4399     fSetOpStack.push(op, *fStatus);
4400     fSetStack.push(new UnicodeSet(), *fStatus);
4401 }
4402
4403 U_NAMESPACE_END
4404 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4405