1 Index: source/common/unicode/utypes.h
2 ===================================================================
3 --- source/common/unicode/utypes.h (revision 292709)
4 +++ source/common/unicode/utypes.h (working copy)
6 U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */
7 U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */
8 U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
9 + U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. */
10 U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
13 Index: source/common/utypes.c
14 ===================================================================
15 --- source/common/utypes.c (revision 292709)
16 +++ source/common/utypes.c (working copy)
18 "U_REGEX_INVALID_RANGE",
19 "U_REGEX_STACK_OVERFLOW",
21 - "U_REGEX_STOPPED_BY_CALLER"
22 + "U_REGEX_STOPPED_BY_CALLER",
23 + "U_REGEX_PATTERN_TOO_BIG"
26 static const char * const
27 Index: source/i18n/regexcmp.cpp
28 ===================================================================
29 --- source/i18n/regexcmp.cpp (revision 292943)
30 +++ source/i18n/regexcmp.cpp (working copy)
32 // present in the saved state: the input string position (int64_t) and
33 // the position in the compiled pattern.
35 - fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
36 + allocateStackData(RESTACKFRAME_HDRCOUNT);
39 // Optimization pass 1: NOPs, back-references, and case-folding
41 // the start of an ( grouping.
42 //4 NOP Resreved, will be replaced by a save if there are
43 // OR | operators at the top level
44 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
45 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
46 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
47 + appendOp(URX_BUILD(URX_STATE_SAVE, 2));
48 + appendOp(URX_BUILD(URX_JMP, 3));
49 + appendOp(URX_BUILD(URX_FAIL, 0));
51 // Standard open nonCapture paren action emits the two NOPs and
52 // sets up the paren stack frame.
56 // add the END operation to the compiled pattern.
57 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
58 + appendOp(URX_BUILD(URX_END, 0));
60 // Terminate the pattern compilation state machine.
63 // the JMP will eventually be the location following the ')' for the
64 // group. This will be patched in later, when the ')' is encountered.
65 op = URX_BUILD(URX_JMP, 0);
66 - fRXPat->fCompiledPat->addElement(op, *fStatus);
69 // Push the position of the newly added JMP op onto the parentheses stack.
70 // This registers if for fixup when this block's close paren is encountered.
72 // Append a NOP to the compiled pattern. This is the slot reserved
73 // for a SAVE in the event that there is yet another '|' following
75 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
76 + appendOp(URX_BUILD(URX_NOP, 0));
77 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
81 // END_CAPTURE is encountered.
84 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
85 - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
86 - fRXPat->fFrameSize += 3;
87 - int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
88 - fRXPat->fCompiledPat->addElement(cop, *fStatus);
89 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
90 + appendOp(URX_BUILD(URX_NOP, 0));
91 + int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
92 + int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
94 + appendOp(URX_BUILD(URX_NOP, 0));
96 // On the Parentheses stack, start a new frame and add the postions
97 // of the two NOPs. Depending on what follows in the pattern, the
99 // is an '|' alternation within the parens.
102 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
103 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
104 + appendOp(URX_BUILD(URX_NOP, 0));
105 + appendOp(URX_BUILD(URX_NOP, 0));
107 // On the Parentheses stack, start a new frame and add the postions
109 @@ -510,12 +509,11 @@
110 // is an '|' alternation within the parens.
113 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
114 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
115 - fRXPat->fDataSize += 1; // state stack ptr.
116 - int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
117 - fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
118 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
119 + appendOp(URX_BUILD(URX_NOP, 0));
120 + int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
121 + int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
123 + appendOp(URX_BUILD(URX_NOP, 0));
125 // On the Parentheses stack, start a new frame and add the postions
126 // of the two NOPs. Depending on what follows in the pattern, the
127 @@ -558,26 +556,25 @@
128 // Two data slots are reserved, for saving the stack ptr and the input position.
131 - int32_t dataLoc = fRXPat->fDataSize;
132 - fRXPat->fDataSize += 2;
133 + int32_t dataLoc = allocateData(2);
134 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
135 - fRXPat->fCompiledPat->addElement(op, *fStatus);
138 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
139 - fRXPat->fCompiledPat->addElement(op, *fStatus);
142 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
143 - fRXPat->fCompiledPat->addElement(op, *fStatus);
147 op = URX_BUILD(URX_LA_END, dataLoc);
148 - fRXPat->fCompiledPat->addElement(op, *fStatus);
151 op = URX_BUILD(URX_BACKTRACK, 0);
152 - fRXPat->fCompiledPat->addElement(op, *fStatus);
156 op = URX_BUILD(URX_NOP, 0);
157 - fRXPat->fCompiledPat->addElement(op, *fStatus);
158 - fRXPat->fCompiledPat->addElement(op, *fStatus);
162 // On the Parentheses stack, start a new frame and add the postions
164 @@ -602,16 +599,15 @@
165 // an alternate (transparent) region.
168 - int32_t dataLoc = fRXPat->fDataSize;
169 - fRXPat->fDataSize += 2;
170 + int32_t dataLoc = allocateData(2);
171 int32_t op = URX_BUILD(URX_LA_START, dataLoc);
172 - fRXPat->fCompiledPat->addElement(op, *fStatus);
175 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later.
176 - fRXPat->fCompiledPat->addElement(op, *fStatus);
179 op = URX_BUILD(URX_NOP, 0);
180 - fRXPat->fCompiledPat->addElement(op, *fStatus);
183 // On the Parentheses stack, start a new frame and add the postions
184 // of the StateSave and NOP.
185 @@ -649,23 +645,22 @@
188 // Allocate data space
189 - int32_t dataLoc = fRXPat->fDataSize;
190 - fRXPat->fDataSize += 4;
191 + int32_t dataLoc = allocateData(4);
194 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
195 - fRXPat->fCompiledPat->addElement(op, *fStatus);
199 op = URX_BUILD(URX_LB_CONT, dataLoc);
200 - fRXPat->fCompiledPat->addElement(op, *fStatus);
201 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
202 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
204 + appendOp(0); // MinMatchLength. To be filled later.
205 + appendOp(0); // MaxMatchLength. To be filled later.
208 op = URX_BUILD(URX_NOP, 0);
209 - fRXPat->fCompiledPat->addElement(op, *fStatus);
210 - fRXPat->fCompiledPat->addElement(op, *fStatus);
214 // On the Parentheses stack, start a new frame and add the postions
215 // of the URX_LB_CONT and the NOP.
216 @@ -705,24 +700,23 @@
219 // Allocate data space
220 - int32_t dataLoc = fRXPat->fDataSize;
221 - fRXPat->fDataSize += 4;
222 + int32_t dataLoc = allocateData(4);
225 int32_t op = URX_BUILD(URX_LB_START, dataLoc);
226 - fRXPat->fCompiledPat->addElement(op, *fStatus);
230 op = URX_BUILD(URX_LBN_CONT, dataLoc);
231 - fRXPat->fCompiledPat->addElement(op, *fStatus);
232 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
233 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
234 - fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
236 + appendOp(0); // MinMatchLength. To be filled later.
237 + appendOp(0); // MaxMatchLength. To be filled later.
238 + appendOp(0); // Continue Loc. To be filled later.
241 op = URX_BUILD(URX_NOP, 0);
242 - fRXPat->fCompiledPat->addElement(op, *fStatus);
243 - fRXPat->fCompiledPat->addElement(op, *fStatus);
247 // On the Parentheses stack, start a new frame and add the postions
248 // of the URX_LB_CONT and the NOP.
249 @@ -793,11 +787,10 @@
250 if (URX_TYPE(repeatedOp) == URX_SETREF) {
251 // Emit optimized code for [char set]+
252 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
253 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
254 - frameLoc = fRXPat->fFrameSize;
255 - fRXPat->fFrameSize++;
257 + frameLoc = allocateStackData(1);
258 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
259 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
264 @@ -813,11 +806,10 @@
265 if (fModeFlags & UREGEX_UNIX_LINES) {
268 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
269 - frameLoc = fRXPat->fFrameSize;
270 - fRXPat->fFrameSize++;
272 + frameLoc = allocateStackData(1);
273 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
274 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
279 @@ -831,18 +823,17 @@
280 // Zero length match is possible.
281 // Emit the code sequence that can handle it.
283 - frameLoc = fRXPat->fFrameSize;
284 - fRXPat->fFrameSize++;
285 + frameLoc = allocateStackData(1);
287 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
288 fRXPat->fCompiledPat->setElementAt(op, topLoc);
290 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
291 - fRXPat->fCompiledPat->addElement(op, *fStatus);
294 // Simpler code when the repeated body must match something non-empty
295 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
296 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
303 int32_t topLoc = blockTopLoc(FALSE);
304 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
305 - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
306 + appendOp(saveStateOp);
310 @@ -892,10 +883,10 @@
311 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
313 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
314 - fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
317 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
318 - fRXPat->fCompiledPat->addElement(save_op, *fStatus);
324 // Emit optimized code for a [char set]*
325 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
326 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
327 - dataLoc = fRXPat->fFrameSize;
328 - fRXPat->fFrameSize++;
329 + dataLoc = allocateStackData(1);
330 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
331 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
339 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
340 - dataLoc = fRXPat->fFrameSize;
341 - fRXPat->fFrameSize++;
342 + dataLoc = allocateStackData(1);
343 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
344 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
350 // extra loop-breaking code.
351 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
352 insertOp(saveStateLoc);
353 - dataLoc = fRXPat->fFrameSize;
354 - fRXPat->fFrameSize++;
355 + dataLoc = allocateStackData(1);
357 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
358 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
360 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
362 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
363 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
369 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
370 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
371 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
372 - fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
373 + appendOp(stateSaveOp);
377 @@ -1078,9 +1066,9 @@
379 // First the STO_SP before the start of the loop
381 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the
382 - fRXPat->fDataSize += 1; // state stack ptr.
383 - int32_t op = URX_BUILD(URX_STO_SP, varLoc);
385 + int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
386 + int32_t op = URX_BUILD(URX_STO_SP, varLoc);
387 fRXPat->fCompiledPat->setElementAt(op, topLoc);
389 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
390 @@ -1090,7 +1078,7 @@
392 // Then the LD_SP after the end of the loop
393 op = URX_BUILD(URX_LD_SP, varLoc);
394 - fRXPat->fCompiledPat->addElement(op, *fStatus);
399 @@ -1134,7 +1122,7 @@
401 op = URX_BUILD(URX_DOTANY, 0);
403 - fRXPat->fCompiledPat->addElement(op, *fStatus);
408 @@ -1151,7 +1139,7 @@
409 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
410 op = URX_CARET_M_UNIX;
412 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
413 + appendOp(URX_BUILD(op, 0));
417 @@ -1168,13 +1156,13 @@
418 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
421 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
422 + appendOp(URX_BUILD(op, 0));
428 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
429 + appendOp(URX_BUILD(URX_CARET, 0));
433 @@ -1186,7 +1174,7 @@
436 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
437 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
438 + appendOp(URX_BUILD(op, 1));
442 @@ -1199,63 +1187,59 @@
445 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
446 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
447 + appendOp(URX_BUILD(op, 0));
453 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
454 + appendOp(URX_BUILD(URX_BACKSLASH_D, 1));
459 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
460 + appendOp(URX_BUILD(URX_BACKSLASH_D, 0));
465 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
466 + appendOp(URX_BUILD(URX_BACKSLASH_G, 0));
471 - fRXPat->fCompiledPat->addElement(
472 - URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
473 + appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET));
478 - fRXPat->fCompiledPat->addElement(
479 - URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
480 + appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET));
485 - fRXPat->fCompiledPat->addElement(
486 - URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
487 + appendOp(URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET));
492 - fRXPat->fCompiledPat->addElement(
493 - URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
494 + appendOp(URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET));
499 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
500 + appendOp(URX_BUILD(URX_BACKSLASH_X, 0));
506 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
507 + appendOp(URX_BUILD(URX_DOLLAR, 0));
512 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
513 + appendOp(URX_BUILD(URX_BACKSLASH_Z, 0));
517 @@ -1321,7 +1305,7 @@
519 op = URX_BUILD(URX_BACKREF, groupNum);
521 - fRXPat->fCompiledPat->addElement(op, *fStatus);
526 @@ -1342,22 +1326,21 @@
529 int32_t topLoc = blockTopLoc(TRUE);
530 - int32_t stoLoc = fRXPat->fDataSize;
531 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
532 + int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
533 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
534 fRXPat->fCompiledPat->setElementAt(op, topLoc);
536 // Emit the STATE_SAVE
537 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
538 - fRXPat->fCompiledPat->addElement(op, *fStatus);
542 op = URX_BUILD(URX_JMP, topLoc+1);
543 - fRXPat->fCompiledPat->addElement(op, *fStatus);
547 op = URX_BUILD(URX_LD_SP, stoLoc);
548 - fRXPat->fCompiledPat->addElement(op, *fStatus);
553 @@ -1377,8 +1360,7 @@
557 - int32_t stoLoc = fRXPat->fDataSize;
558 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
559 + int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
560 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
561 fRXPat->fCompiledPat->setElementAt(op, topLoc);
563 @@ -1389,11 +1371,11 @@
565 // Append the JMP operation.
566 op = URX_BUILD(URX_JMP, topLoc+1);
567 - fRXPat->fCompiledPat->addElement(op, *fStatus);
570 // Emit the LD_SP loc
571 op = URX_BUILD(URX_LD_SP, stoLoc);
572 - fRXPat->fCompiledPat->addElement(op, *fStatus);
577 @@ -1412,8 +1394,7 @@
581 - int32_t stoLoc = fRXPat->fDataSize;
582 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
583 + int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
584 int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
585 fRXPat->fCompiledPat->setElementAt(op, topLoc);
587 @@ -1424,7 +1405,7 @@
590 op = URX_BUILD(URX_LD_SP, stoLoc);
591 - fRXPat->fCompiledPat->addElement(op, *fStatus);
596 @@ -1481,8 +1462,8 @@
597 // is an '|' alternation within the parens.
600 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
601 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
602 + appendOp(URX_BUILD(URX_NOP, 0));
603 + appendOp(URX_BUILD(URX_NOP, 0));
605 // On the Parentheses stack, start a new frame and add the postions
606 // of the two NOPs (a normal non-capturing () frame, except for the
607 @@ -1862,7 +1843,7 @@
609 op = URX_BUILD(URX_ONECHAR, lastCodePoint);
611 - fRXPat->fCompiledPat->addElement(op, *fStatus);
614 // Two or more chars, emit a URX_STRING to match them.
615 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
616 @@ -1872,12 +1853,19 @@
617 // into two single char ops, for efficiency.
618 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
620 - fRXPat->fCompiledPat->addElement(op, *fStatus);
622 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
623 - fRXPat->fCompiledPat->addElement(op, *fStatus);
627 // Add this string into the accumulated strings of the compiled pattern.
628 + // The total size of the accumulated strings must be restricted to 24 bits because
629 + // string indexes appear as compiled pattern operand values.
630 + // This is the only place that the pattern.fLiteralText string is modified.
632 fRXPat->fLiteralText.append(fLiteralChars);
633 + if (U_SUCCESS(*fStatus) && fRXPat->fLiteralText.length() > 0x00ffffff) {
634 + *fStatus = U_REGEX_PATTERN_TOO_BIG;
638 fLiteralChars.remove();
639 @@ -1884,10 +1872,22 @@
643 +//------------------------------------------------------------------------------
645 +// appendOp() Append a new instruction onto the compiled pattern
646 +// Includes error checking, limiting the size of the
647 +// pattern to lengths that can be represented in the
648 +// 24 bit operand field of an instruction.
650 +//------------------------------------------------------------------------------
651 +void RegexCompile::appendOp(int32_t op) {
652 + fRXPat->fCompiledPat->addElement(op, *fStatus);
653 + if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
654 + *fStatus = U_REGEX_PATTERN_TOO_BIG;
661 //------------------------------------------------------------------------------
663 // insertOp() Insert a slot for a new opcode into the already
664 @@ -1947,9 +1947,61 @@
668 +//------------------------------------------------------------------------------
670 +// allocateData() Allocate storage in the matcher's static data area.
671 +// Return the index for the newly allocated data.
672 +// The storage won't actually exist until we are running a match
673 +// operation, but the storage indexes are inserted into various
674 +// opcodes while compiling the pattern.
676 +//------------------------------------------------------------------------------
677 +int32_t RegexCompile::allocateData(int32_t size) {
678 + if (U_FAILURE(*fStatus)) {
681 + if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
682 + *fStatus = U_REGEX_INTERNAL_ERROR;
685 + int32_t dataIndex = fRXPat->fDataSize;
686 + fRXPat->fDataSize += size;
687 + if (fRXPat->fDataSize >= 0x00fffff0) {
688 + *fStatus = U_REGEX_PATTERN_TOO_BIG;
694 //------------------------------------------------------------------------------
696 +// allocateStackData() Allocate space in the back-tracking stack frame.
697 +// Return the index for the newly allocated data.
698 +// The frame indexes are inserted into various
699 +// opcodes while compiling the pattern, meaning that frame
700 +// size must be restricted to the size that will fit
701 +// as an operand (24 bits).
703 +//------------------------------------------------------------------------------
704 +int32_t RegexCompile::allocateStackData(int32_t size) {
705 + if (U_FAILURE(*fStatus)) {
708 + if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
709 + *fStatus = U_REGEX_INTERNAL_ERROR;
712 + int32_t dataIndex = fRXPat->fFrameSize;
713 + fRXPat->fFrameSize += size;
714 + if (fRXPat->fFrameSize >= 0x00fffff0) {
715 + *fStatus = U_REGEX_PATTERN_TOO_BIG;
721 +//------------------------------------------------------------------------------
723 // blockTopLoc() Find or create a location in the compiled pattern
724 // at the start of the operation or block that has
725 // just been compiled. Needed when a quantifier (* or
726 @@ -2065,7 +2117,7 @@
728 int32_t frameVarLocation = URX_VAL(captureOp);
729 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
730 - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
731 + appendOp(endCaptureOp);
735 @@ -2077,7 +2129,7 @@
736 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
737 int32_t stoLoc = URX_VAL(stoOp);
738 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
739 - fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
744 @@ -2087,7 +2139,7 @@
745 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
746 int32_t dataLoc = URX_VAL(startOp);
747 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
748 - fRXPat->fCompiledPat->addElement(op, *fStatus);
753 @@ -2098,11 +2150,11 @@
754 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
755 int32_t dataLoc = URX_VAL(startOp);
756 int32_t op = URX_BUILD(URX_LA_END, dataLoc);
757 - fRXPat->fCompiledPat->addElement(op, *fStatus);
759 op = URX_BUILD(URX_BACKTRACK, 0);
760 - fRXPat->fCompiledPat->addElement(op, *fStatus);
762 op = URX_BUILD(URX_LA_END, dataLoc);
763 - fRXPat->fCompiledPat->addElement(op, *fStatus);
766 // Patch the URX_SAVE near the top of the block.
767 // The destination of the SAVE is the final LA_END that was just added.
768 @@ -2123,9 +2175,9 @@
769 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
770 int32_t dataLoc = URX_VAL(startOp);
771 int32_t op = URX_BUILD(URX_LB_END, dataLoc);
772 - fRXPat->fCompiledPat->addElement(op, *fStatus);
773 - op = URX_BUILD(URX_LA_END, dataLoc);
774 - fRXPat->fCompiledPat->addElement(op, *fStatus);
776 + op = URX_BUILD(URX_LA_END, dataLoc);
779 // Determine the min and max bounds for the length of the
780 // string that the pattern can match.
781 @@ -2162,7 +2214,7 @@
782 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
783 int32_t dataLoc = URX_VAL(startOp);
784 int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
785 - fRXPat->fCompiledPat->addElement(op, *fStatus);
788 // Determine the min and max bounds for the length of the
789 // string that the pattern can match.
790 @@ -2228,7 +2280,7 @@
793 // Set of no elements. Always fails to match.
794 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
795 + appendOp(URX_BUILD(URX_BACKTRACK, 0));
799 @@ -2250,7 +2302,7 @@
800 int32_t setNumber = fRXPat->fSets->size();
801 fRXPat->fSets->addElement(theSet, *fStatus);
802 int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
803 - fRXPat->fCompiledPat->addElement(setOp, *fStatus);
808 @@ -2289,11 +2341,8 @@
809 // counterLoc --> Loop counter
810 // +1 --> Input index (for breaking non-progressing loops)
811 // (Only present if unbounded upper limit on loop)
812 - int32_t counterLoc = fRXPat->fFrameSize;
813 - fRXPat->fFrameSize++;
814 - if (fIntervalUpper < 0) {
815 - fRXPat->fFrameSize++;
817 + int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
818 + int32_t counterLoc = allocateStackData(dataSize);
820 int32_t op = URX_BUILD(InitOp, counterLoc);
821 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
822 @@ -2313,7 +2362,7 @@
823 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
824 // Goes at end of the block being looped over, so just append to the code so far.
825 op = URX_BUILD(LoopOp, topOfBlock);
826 - fRXPat->fCompiledPat->addElement(op, *fStatus);
829 if ((fIntervalLow & 0xff000000) != 0 ||
830 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
831 @@ -2380,12 +2429,12 @@
833 for (i=1; i<fIntervalUpper; i++ ) {
834 if (i == fIntervalLow) {
835 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
838 if (i > fIntervalLow) {
839 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
842 - fRXPat->fCompiledPat->addElement(op, *fStatus);
847 Index: source/i18n/regexcmp.h
848 ===================================================================
849 --- source/i18n/regexcmp.h (revision 292943)
850 +++ source/i18n/regexcmp.h (working copy)
852 void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
853 void insertOp(int32_t where); // Open up a slot for a new op in the
854 // generated code at the specified location.
855 + void appendOp(int32_t op); // Append a new op to the compiled pattern.
856 + int32_t allocateData(int32_t size); // Allocate space in the matcher data area.
857 + // Return index of the newly allocated data.
858 + int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame.
859 + // Return offset index in the frame.
860 int32_t minMatchLength(int32_t start,
862 int32_t maxMatchLength(int32_t start,
863 Index: source/i18n/regeximp.cpp
864 ===================================================================
865 --- source/i18n/regeximp.cpp (revision 292709)
866 +++ source/i18n/regeximp.cpp (working copy)
869 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
870 #include "regeximp.h"
872 +#include "uassert.h"
873 #include "unicode/utf16.h"
880 +// Assemble a pcode instruction from the opcode and operand values.
881 +// Out-of-range values should not occur - if they do it is from an internal
882 +// error in the regex compiler.
884 +// TODO: move into regexcmp, where it has access to fStatus.
885 +// NOP cleanly if U_FAILURE.
886 +// Set U_REGEX_INTERNAL_ERROR on bad operands.
888 +int32_t URX_BUILD(int32_t type, int32_t val) {
889 + if (type < 0 || type > 255) {
891 + type = URX_RESERVED_OP;
893 + if (val > 0x00ffffff) {
897 + return (type << 24) | val;
904 Index: source/i18n/regeximp.h
905 ===================================================================
906 --- source/i18n/regeximp.h (revision 292709)
907 +++ source/i18n/regeximp.h (working copy)
910 // Convenience macros for assembling and disassembling a compiled operation.
912 -#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
913 +int32_t URX_BUILD(int32_t type, int32_t val); // Packs opcode `type` above bit 24 and operand `val` into the low 24 bits.
914 #define URX_TYPE(x) ((uint32_t)(x) >> 24)
915 #define URX_VAL(x) ((x) & 0xffffff)
917 Index: source/test/intltest/regextst.cpp
918 ===================================================================
919 --- source/test/intltest/regextst.cpp (revision 292709)
920 +++ source/test/intltest/regextst.cpp (working copy)
922 case 21: name = "Bug 9283";
925 + case 22: name = "TestBug11371";
926 + if (exec) TestBug11371();
930 break; //needed to end loop
931 @@ -5229,5 +5232,47 @@
935 +void RegexTest::TestBug11371() {
936 + UErrorCode status = U_ZERO_ERROR;
937 + UnicodeString patternString;
939 + for (int i=0; i<8000000; i++) {
940 + patternString.append(UnicodeString("()"));
942 + LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
943 + if (status != U_REGEX_PATTERN_TOO_BIG) {
944 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
945 + __FILE__, __LINE__, u_errorName(status));
948 + status = U_ZERO_ERROR;
949 + patternString = "(";
950 + for (int i=0; i<20000000; i++) {
951 + patternString.append(UnicodeString("A++"));
953 + patternString.append(UnicodeString("){0}B++"));
954 + LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
955 + if (status != U_REGEX_PATTERN_TOO_BIG) {
956 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
957 + __FILE__, __LINE__, u_errorName(status));
960 + // Pattern with too much string data, such that string indexes overflow operand data.
961 + status = U_ZERO_ERROR;
962 + patternString = "";
963 + while (patternString.length() < 0x00ffffff) {
964 + patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
966 + patternString.append(UnicodeString("X? trailing string"));
967 + LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
968 + if (compiledPat3.isValid()) { compiledPat3->dumpPattern(); } // compile is expected to fail here, so the pointer may be NULL; never dereference it unchecked.
969 + if (status != U_REGEX_PATTERN_TOO_BIG) {
970 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
971 + __FILE__, __LINE__, u_errorName(status));
978 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
980 Index: source/test/intltest/regextst.h
981 ===================================================================
982 --- source/test/intltest/regextst.h (revision 292709)
983 +++ source/test/intltest/regextst.h (working copy)
985 virtual void Bug7029();
986 virtual void Bug9283();
987 virtual void CheckInvBufSize();
988 + virtual void TestBug11371();
990 // The following functions are internal to the regexp tests.
991 virtual void assertUText(const char *expected, UText *actual, const char *file, int line);