1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 // A simple interpreter for the Irregexp byte code.
35 #include "bytecodes-irregexp.h"
36 #include "interpreter-irregexp.h"
38 #include "regexp-macro-assembler.h"
44 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
46 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
50 Vector<const uc16> subject) {
51 for (int i = 0; i < len; i++) {
52 unibrow::uchar old_char = subject[from++];
53 unibrow::uchar new_char = subject[current++];
54 if (old_char == new_char) continue;
55 unibrow::uchar old_string[1] = { old_char };
56 unibrow::uchar new_string[1] = { new_char };
57 interp_canonicalize->get(old_char, '\0', old_string);
58 interp_canonicalize->get(new_char, '\0', new_string);
59 if (old_string[0] != new_string[0]) {
67 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
71 Vector<const char> subject) {
72 for (int i = 0; i < len; i++) {
73 unsigned int old_char = subject[from++];
74 unsigned int new_char = subject[current++];
75 if (old_char == new_char) continue;
76 if (old_char - 'A' <= 'Z' - 'A') old_char |= 0x20;
77 if (new_char - 'A' <= 'Z' - 'A') new_char |= 0x20;
78 if (old_char != new_char) return false;
85 static void TraceInterpreter(const byte* code_base,
89 uint32_t current_char,
91 const char* bytecode_name) {
92 if (FLAG_trace_regexp_bytecodes) {
93 bool printable = (current_char < 127 && current_char >= 32);
96 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
97 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
103 printable ? current_char : '.',
105 for (int i = 0; i < bytecode_length; i++) {
106 printf(", %02x", pc[i]);
109 for (int i = 1; i < bytecode_length; i++) {
110 unsigned char b = pc[i];
111 if (b < 127 && b >= 32) {
122 #define BYTECODE(name) \
124 TraceInterpreter(code_base, \
126 static_cast<int>(backtrack_sp - backtrack_stack_base), \
129 BC_##name##_LENGTH, \
132 #define BYTECODE(name) \
137 static int32_t Load32Aligned(const byte* pc) {
138 ASSERT((reinterpret_cast<intptr_t>(pc) & 3) == 0);
139 return *reinterpret_cast<const int32_t *>(pc);
143 static int32_t Load16Aligned(const byte* pc) {
144 ASSERT((reinterpret_cast<intptr_t>(pc) & 1) == 0);
145 return *reinterpret_cast<const uint16_t *>(pc);
149 // A simple abstraction over the backtracking stack used by the interpreter.
150 // This backtracking stack does not grow automatically, but it ensures that the
151 // the memory held by the stack is released or remembered in a cache if the
152 // matching terminates.
153 class BacktrackStack {
155 explicit BacktrackStack(Isolate* isolate) : isolate_(isolate) {
156 if (isolate->irregexp_interpreter_backtrack_stack_cache() != NULL) {
157 // If the cache is not empty reuse the previously allocated stack.
158 data_ = isolate->irregexp_interpreter_backtrack_stack_cache();
159 isolate->set_irregexp_interpreter_backtrack_stack_cache(NULL);
161 // Cache was empty. Allocate a new backtrack stack.
162 data_ = NewArray<int>(kBacktrackStackSize);
167 if (isolate_->irregexp_interpreter_backtrack_stack_cache() == NULL) {
168 // The cache is empty. Keep this backtrack stack around.
169 isolate_->set_irregexp_interpreter_backtrack_stack_cache(data_);
171 // A backtrack stack was already cached, just release this one.
176 int* data() const { return data_; }
178 int max_size() const { return kBacktrackStackSize; }
181 static const int kBacktrackStackSize = 10000;
186 DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
190 template <typename Char>
191 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
192 const byte* code_base,
193 Vector<const Char> subject,
196 uint32_t current_char) {
197 const byte* pc = code_base;
198 // BacktrackStack ensures that the memory allocated for the backtracking stack
199 // is returned to the system or cached if there is no stack being cached at
201 BacktrackStack backtrack_stack(isolate);
202 int* backtrack_stack_base = backtrack_stack.data();
203 int* backtrack_sp = backtrack_stack_base;
204 int backtrack_stack_space = backtrack_stack.max_size();
206 if (FLAG_trace_regexp_bytecodes) {
207 PrintF("\n\nStart bytecode interpreter\n\n");
211 int32_t insn = Load32Aligned(pc);
212 switch (insn & BYTECODE_MASK) {
215 return RegExpImpl::RE_FAILURE;
217 if (--backtrack_stack_space < 0) {
218 return RegExpImpl::RE_EXCEPTION;
220 *backtrack_sp++ = current;
221 pc += BC_PUSH_CP_LENGTH;
224 if (--backtrack_stack_space < 0) {
225 return RegExpImpl::RE_EXCEPTION;
227 *backtrack_sp++ = Load32Aligned(pc + 4);
228 pc += BC_PUSH_BT_LENGTH;
230 BYTECODE(PUSH_REGISTER)
231 if (--backtrack_stack_space < 0) {
232 return RegExpImpl::RE_EXCEPTION;
234 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
235 pc += BC_PUSH_REGISTER_LENGTH;
237 BYTECODE(SET_REGISTER)
238 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
239 pc += BC_SET_REGISTER_LENGTH;
241 BYTECODE(ADVANCE_REGISTER)
242 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
243 pc += BC_ADVANCE_REGISTER_LENGTH;
245 BYTECODE(SET_REGISTER_TO_CP)
246 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
247 pc += BC_SET_REGISTER_TO_CP_LENGTH;
249 BYTECODE(SET_CP_TO_REGISTER)
250 current = registers[insn >> BYTECODE_SHIFT];
251 pc += BC_SET_CP_TO_REGISTER_LENGTH;
253 BYTECODE(SET_REGISTER_TO_SP)
254 registers[insn >> BYTECODE_SHIFT] =
255 static_cast<int>(backtrack_sp - backtrack_stack_base);
256 pc += BC_SET_REGISTER_TO_SP_LENGTH;
258 BYTECODE(SET_SP_TO_REGISTER)
259 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
260 backtrack_stack_space = backtrack_stack.max_size() -
261 static_cast<int>(backtrack_sp - backtrack_stack_base);
262 pc += BC_SET_SP_TO_REGISTER_LENGTH;
265 backtrack_stack_space++;
267 current = *backtrack_sp;
268 pc += BC_POP_CP_LENGTH;
271 backtrack_stack_space++;
273 pc = code_base + *backtrack_sp;
275 BYTECODE(POP_REGISTER)
276 backtrack_stack_space++;
278 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
279 pc += BC_POP_REGISTER_LENGTH;
282 return RegExpImpl::RE_FAILURE;
284 return RegExpImpl::RE_SUCCESS;
286 current += insn >> BYTECODE_SHIFT;
287 pc += BC_ADVANCE_CP_LENGTH;
290 pc = code_base + Load32Aligned(pc + 4);
292 BYTECODE(ADVANCE_CP_AND_GOTO)
293 current += insn >> BYTECODE_SHIFT;
294 pc = code_base + Load32Aligned(pc + 4);
296 BYTECODE(CHECK_GREEDY)
297 if (current == backtrack_sp[-1]) {
299 backtrack_stack_space++;
300 pc = code_base + Load32Aligned(pc + 4);
302 pc += BC_CHECK_GREEDY_LENGTH;
305 BYTECODE(LOAD_CURRENT_CHAR) {
306 int pos = current + (insn >> BYTECODE_SHIFT);
307 if (pos >= subject.length()) {
308 pc = code_base + Load32Aligned(pc + 4);
310 current_char = subject[pos];
311 pc += BC_LOAD_CURRENT_CHAR_LENGTH;
315 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
316 int pos = current + (insn >> BYTECODE_SHIFT);
317 current_char = subject[pos];
318 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
321 BYTECODE(LOAD_2_CURRENT_CHARS) {
322 int pos = current + (insn >> BYTECODE_SHIFT);
323 if (pos + 2 > subject.length()) {
324 pc = code_base + Load32Aligned(pc + 4);
326 Char next = subject[pos + 1];
328 (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
329 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
333 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
334 int pos = current + (insn >> BYTECODE_SHIFT);
335 Char next = subject[pos + 1];
336 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
337 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
340 BYTECODE(LOAD_4_CURRENT_CHARS) {
341 ASSERT(sizeof(Char) == 1);
342 int pos = current + (insn >> BYTECODE_SHIFT);
343 if (pos + 4 > subject.length()) {
344 pc = code_base + Load32Aligned(pc + 4);
346 Char next1 = subject[pos + 1];
347 Char next2 = subject[pos + 2];
348 Char next3 = subject[pos + 3];
349 current_char = (subject[pos] |
353 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
357 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
358 ASSERT(sizeof(Char) == 1);
359 int pos = current + (insn >> BYTECODE_SHIFT);
360 Char next1 = subject[pos + 1];
361 Char next2 = subject[pos + 2];
362 Char next3 = subject[pos + 3];
363 current_char = (subject[pos] |
367 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
370 BYTECODE(CHECK_4_CHARS) {
371 uint32_t c = Load32Aligned(pc + 4);
372 if (c == current_char) {
373 pc = code_base + Load32Aligned(pc + 8);
375 pc += BC_CHECK_4_CHARS_LENGTH;
379 BYTECODE(CHECK_CHAR) {
380 uint32_t c = (insn >> BYTECODE_SHIFT);
381 if (c == current_char) {
382 pc = code_base + Load32Aligned(pc + 4);
384 pc += BC_CHECK_CHAR_LENGTH;
388 BYTECODE(CHECK_NOT_4_CHARS) {
389 uint32_t c = Load32Aligned(pc + 4);
390 if (c != current_char) {
391 pc = code_base + Load32Aligned(pc + 8);
393 pc += BC_CHECK_NOT_4_CHARS_LENGTH;
397 BYTECODE(CHECK_NOT_CHAR) {
398 uint32_t c = (insn >> BYTECODE_SHIFT);
399 if (c != current_char) {
400 pc = code_base + Load32Aligned(pc + 4);
402 pc += BC_CHECK_NOT_CHAR_LENGTH;
406 BYTECODE(AND_CHECK_4_CHARS) {
407 uint32_t c = Load32Aligned(pc + 4);
408 if (c == (current_char & Load32Aligned(pc + 8))) {
409 pc = code_base + Load32Aligned(pc + 12);
411 pc += BC_AND_CHECK_4_CHARS_LENGTH;
415 BYTECODE(AND_CHECK_CHAR) {
416 uint32_t c = (insn >> BYTECODE_SHIFT);
417 if (c == (current_char & Load32Aligned(pc + 4))) {
418 pc = code_base + Load32Aligned(pc + 8);
420 pc += BC_AND_CHECK_CHAR_LENGTH;
424 BYTECODE(AND_CHECK_NOT_4_CHARS) {
425 uint32_t c = Load32Aligned(pc + 4);
426 if (c != (current_char & Load32Aligned(pc + 8))) {
427 pc = code_base + Load32Aligned(pc + 12);
429 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
433 BYTECODE(AND_CHECK_NOT_CHAR) {
434 uint32_t c = (insn >> BYTECODE_SHIFT);
435 if (c != (current_char & Load32Aligned(pc + 4))) {
436 pc = code_base + Load32Aligned(pc + 8);
438 pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
442 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
443 uint32_t c = (insn >> BYTECODE_SHIFT);
444 uint32_t minus = Load16Aligned(pc + 4);
445 uint32_t mask = Load16Aligned(pc + 6);
446 if (c != ((current_char - minus) & mask)) {
447 pc = code_base + Load32Aligned(pc + 8);
449 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
453 BYTECODE(CHECK_CHAR_IN_RANGE) {
454 uint32_t from = Load16Aligned(pc + 4);
455 uint32_t to = Load16Aligned(pc + 6);
456 if (from <= current_char && current_char <= to) {
457 pc = code_base + Load32Aligned(pc + 8);
459 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
463 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
464 uint32_t from = Load16Aligned(pc + 4);
465 uint32_t to = Load16Aligned(pc + 6);
466 if (from > current_char || current_char > to) {
467 pc = code_base + Load32Aligned(pc + 8);
469 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
473 BYTECODE(CHECK_BIT_IN_TABLE) {
474 int mask = RegExpMacroAssembler::kTableMask;
475 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
476 int bit = (current_char & (kBitsPerByte - 1));
477 if ((b & (1 << bit)) != 0) {
478 pc = code_base + Load32Aligned(pc + 4);
480 pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
485 uint32_t limit = (insn >> BYTECODE_SHIFT);
486 if (current_char < limit) {
487 pc = code_base + Load32Aligned(pc + 4);
489 pc += BC_CHECK_LT_LENGTH;
494 uint32_t limit = (insn >> BYTECODE_SHIFT);
495 if (current_char > limit) {
496 pc = code_base + Load32Aligned(pc + 4);
498 pc += BC_CHECK_GT_LENGTH;
502 BYTECODE(CHECK_REGISTER_LT)
503 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
504 pc = code_base + Load32Aligned(pc + 8);
506 pc += BC_CHECK_REGISTER_LT_LENGTH;
509 BYTECODE(CHECK_REGISTER_GE)
510 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
511 pc = code_base + Load32Aligned(pc + 8);
513 pc += BC_CHECK_REGISTER_GE_LENGTH;
516 BYTECODE(CHECK_REGISTER_EQ_POS)
517 if (registers[insn >> BYTECODE_SHIFT] == current) {
518 pc = code_base + Load32Aligned(pc + 4);
520 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
523 BYTECODE(CHECK_NOT_REGS_EQUAL)
524 if (registers[insn >> BYTECODE_SHIFT] ==
525 registers[Load32Aligned(pc + 4)]) {
526 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
528 pc = code_base + Load32Aligned(pc + 8);
531 BYTECODE(CHECK_NOT_BACK_REF) {
532 int from = registers[insn >> BYTECODE_SHIFT];
533 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
534 if (from < 0 || len <= 0) {
535 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
538 if (current + len > subject.length()) {
539 pc = code_base + Load32Aligned(pc + 4);
543 for (i = 0; i < len; i++) {
544 if (subject[from + i] != subject[current + i]) {
545 pc = code_base + Load32Aligned(pc + 4);
552 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
555 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
556 int from = registers[insn >> BYTECODE_SHIFT];
557 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
558 if (from < 0 || len <= 0) {
559 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
562 if (current + len > subject.length()) {
563 pc = code_base + Load32Aligned(pc + 4);
566 if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
567 from, current, len, subject)) {
569 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
571 pc = code_base + Load32Aligned(pc + 4);
576 BYTECODE(CHECK_AT_START)
578 pc = code_base + Load32Aligned(pc + 4);
580 pc += BC_CHECK_AT_START_LENGTH;
583 BYTECODE(CHECK_NOT_AT_START)
585 pc += BC_CHECK_NOT_AT_START_LENGTH;
587 pc = code_base + Load32Aligned(pc + 4);
590 BYTECODE(SET_CURRENT_POSITION_FROM_END) {
591 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
592 if (subject.length() - current > by) {
593 current = subject.length() - by;
594 current_char = subject[current - 1];
596 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
607 RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
609 Handle<ByteArray> code_array,
610 Handle<String> subject,
612 int start_position) {
613 ASSERT(subject->IsFlat());
615 AssertNoAllocation a;
616 const byte* code_base = code_array->GetDataStartAddress();
617 uc16 previous_char = '\n';
618 String::FlatContent subject_content = subject->GetFlatContent();
619 if (subject_content.IsAscii()) {
620 Vector<const char> subject_vector = subject_content.ToAsciiVector();
621 if (start_position != 0) previous_char = subject_vector[start_position - 1];
622 return RawMatch(isolate,
629 ASSERT(subject_content.IsTwoByte());
630 Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
631 if (start_position != 0) previous_char = subject_vector[start_position - 1];
632 return RawMatch(isolate,
641 } } // namespace v8::internal