1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // A simple interpreter for the Irregexp byte code.
12 #include "bytecodes-irregexp.h"
13 #include "interpreter-irregexp.h"
15 #include "regexp-macro-assembler.h"
21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
27 Vector<const uc16> subject) {
28 for (int i = 0; i < len; i++) {
29 unibrow::uchar old_char = subject[from++];
30 unibrow::uchar new_char = subject[current++];
31 if (old_char == new_char) continue;
32 unibrow::uchar old_string[1] = { old_char };
33 unibrow::uchar new_string[1] = { new_char };
34 interp_canonicalize->get(old_char, '\0', old_string);
35 interp_canonicalize->get(new_char, '\0', new_string);
36 if (old_string[0] != new_string[0]) {
44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
48 Vector<const uint8_t> subject) {
49 for (int i = 0; i < len; i++) {
50 unsigned int old_char = subject[from++];
51 unsigned int new_char = subject[current++];
52 if (old_char == new_char) continue;
53 // Convert both characters to lower case.
56 if (old_char != new_char) return false;
57 // Not letters in the ASCII range and Latin-1 range.
58 if (!(old_char - 'a' <= 'z' - 'a') &&
59 !(old_char - 224 <= 254 - 224 && old_char != 247)) {
68 static void TraceInterpreter(const byte* code_base,
72 uint32_t current_char,
74 const char* bytecode_name) {
75 if (FLAG_trace_regexp_bytecodes) {
76 bool printable = (current_char < 127 && current_char >= 32);
79 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
80 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
86 printable ? current_char : '.',
88 for (int i = 0; i < bytecode_length; i++) {
89 printf(", %02x", pc[i]);
92 for (int i = 1; i < bytecode_length; i++) {
93 unsigned char b = pc[i];
94 if (b < 127 && b >= 32) {
105 #define BYTECODE(name) \
107 TraceInterpreter(code_base, \
109 static_cast<int>(backtrack_sp - backtrack_stack_base), \
112 BC_##name##_LENGTH, \
115 #define BYTECODE(name) \
120 static int32_t Load32Aligned(const byte* pc) {
121 ASSERT((reinterpret_cast<intptr_t>(pc) & 3) == 0);
122 return *reinterpret_cast<const int32_t *>(pc);
126 static int32_t Load16Aligned(const byte* pc) {
127 ASSERT((reinterpret_cast<intptr_t>(pc) & 1) == 0);
128 return *reinterpret_cast<const uint16_t *>(pc);
132 // A simple abstraction over the backtracking stack used by the interpreter.
133 // This backtracking stack does not grow automatically, but it ensures that the
134 // the memory held by the stack is released or remembered in a cache if the
135 // matching terminates.
136 class BacktrackStack {
138 explicit BacktrackStack() {
139 data_ = NewArray<int>(kBacktrackStackSize);
146 int* data() const { return data_; }
148 int max_size() const { return kBacktrackStackSize; }
151 static const int kBacktrackStackSize = 10000;
155 DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
159 template <typename Char>
160 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
161 const byte* code_base,
162 Vector<const Char> subject,
165 uint32_t current_char) {
166 const byte* pc = code_base;
167 // BacktrackStack ensures that the memory allocated for the backtracking stack
168 // is returned to the system or cached if there is no stack being cached at
170 BacktrackStack backtrack_stack;
171 int* backtrack_stack_base = backtrack_stack.data();
172 int* backtrack_sp = backtrack_stack_base;
173 int backtrack_stack_space = backtrack_stack.max_size();
175 if (FLAG_trace_regexp_bytecodes) {
176 PrintF("\n\nStart bytecode interpreter\n\n");
180 int32_t insn = Load32Aligned(pc);
181 switch (insn & BYTECODE_MASK) {
184 return RegExpImpl::RE_FAILURE;
186 if (--backtrack_stack_space < 0) {
187 return RegExpImpl::RE_EXCEPTION;
189 *backtrack_sp++ = current;
190 pc += BC_PUSH_CP_LENGTH;
193 if (--backtrack_stack_space < 0) {
194 return RegExpImpl::RE_EXCEPTION;
196 *backtrack_sp++ = Load32Aligned(pc + 4);
197 pc += BC_PUSH_BT_LENGTH;
199 BYTECODE(PUSH_REGISTER)
200 if (--backtrack_stack_space < 0) {
201 return RegExpImpl::RE_EXCEPTION;
203 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
204 pc += BC_PUSH_REGISTER_LENGTH;
206 BYTECODE(SET_REGISTER)
207 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
208 pc += BC_SET_REGISTER_LENGTH;
210 BYTECODE(ADVANCE_REGISTER)
211 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
212 pc += BC_ADVANCE_REGISTER_LENGTH;
214 BYTECODE(SET_REGISTER_TO_CP)
215 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
216 pc += BC_SET_REGISTER_TO_CP_LENGTH;
218 BYTECODE(SET_CP_TO_REGISTER)
219 current = registers[insn >> BYTECODE_SHIFT];
220 pc += BC_SET_CP_TO_REGISTER_LENGTH;
222 BYTECODE(SET_REGISTER_TO_SP)
223 registers[insn >> BYTECODE_SHIFT] =
224 static_cast<int>(backtrack_sp - backtrack_stack_base);
225 pc += BC_SET_REGISTER_TO_SP_LENGTH;
227 BYTECODE(SET_SP_TO_REGISTER)
228 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
229 backtrack_stack_space = backtrack_stack.max_size() -
230 static_cast<int>(backtrack_sp - backtrack_stack_base);
231 pc += BC_SET_SP_TO_REGISTER_LENGTH;
234 backtrack_stack_space++;
236 current = *backtrack_sp;
237 pc += BC_POP_CP_LENGTH;
240 backtrack_stack_space++;
242 pc = code_base + *backtrack_sp;
244 BYTECODE(POP_REGISTER)
245 backtrack_stack_space++;
247 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
248 pc += BC_POP_REGISTER_LENGTH;
251 return RegExpImpl::RE_FAILURE;
253 return RegExpImpl::RE_SUCCESS;
255 current += insn >> BYTECODE_SHIFT;
256 pc += BC_ADVANCE_CP_LENGTH;
259 pc = code_base + Load32Aligned(pc + 4);
261 BYTECODE(ADVANCE_CP_AND_GOTO)
262 current += insn >> BYTECODE_SHIFT;
263 pc = code_base + Load32Aligned(pc + 4);
265 BYTECODE(CHECK_GREEDY)
266 if (current == backtrack_sp[-1]) {
268 backtrack_stack_space++;
269 pc = code_base + Load32Aligned(pc + 4);
271 pc += BC_CHECK_GREEDY_LENGTH;
274 BYTECODE(LOAD_CURRENT_CHAR) {
275 int pos = current + (insn >> BYTECODE_SHIFT);
276 if (pos >= subject.length()) {
277 pc = code_base + Load32Aligned(pc + 4);
279 current_char = subject[pos];
280 pc += BC_LOAD_CURRENT_CHAR_LENGTH;
284 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
285 int pos = current + (insn >> BYTECODE_SHIFT);
286 current_char = subject[pos];
287 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
290 BYTECODE(LOAD_2_CURRENT_CHARS) {
291 int pos = current + (insn >> BYTECODE_SHIFT);
292 if (pos + 2 > subject.length()) {
293 pc = code_base + Load32Aligned(pc + 4);
295 Char next = subject[pos + 1];
297 (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
298 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
302 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
303 int pos = current + (insn >> BYTECODE_SHIFT);
304 Char next = subject[pos + 1];
305 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
306 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
309 BYTECODE(LOAD_4_CURRENT_CHARS) {
310 ASSERT(sizeof(Char) == 1);
311 int pos = current + (insn >> BYTECODE_SHIFT);
312 if (pos + 4 > subject.length()) {
313 pc = code_base + Load32Aligned(pc + 4);
315 Char next1 = subject[pos + 1];
316 Char next2 = subject[pos + 2];
317 Char next3 = subject[pos + 3];
318 current_char = (subject[pos] |
322 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
326 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
327 ASSERT(sizeof(Char) == 1);
328 int pos = current + (insn >> BYTECODE_SHIFT);
329 Char next1 = subject[pos + 1];
330 Char next2 = subject[pos + 2];
331 Char next3 = subject[pos + 3];
332 current_char = (subject[pos] |
336 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
339 BYTECODE(CHECK_4_CHARS) {
340 uint32_t c = Load32Aligned(pc + 4);
341 if (c == current_char) {
342 pc = code_base + Load32Aligned(pc + 8);
344 pc += BC_CHECK_4_CHARS_LENGTH;
348 BYTECODE(CHECK_CHAR) {
349 uint32_t c = (insn >> BYTECODE_SHIFT);
350 if (c == current_char) {
351 pc = code_base + Load32Aligned(pc + 4);
353 pc += BC_CHECK_CHAR_LENGTH;
357 BYTECODE(CHECK_NOT_4_CHARS) {
358 uint32_t c = Load32Aligned(pc + 4);
359 if (c != current_char) {
360 pc = code_base + Load32Aligned(pc + 8);
362 pc += BC_CHECK_NOT_4_CHARS_LENGTH;
366 BYTECODE(CHECK_NOT_CHAR) {
367 uint32_t c = (insn >> BYTECODE_SHIFT);
368 if (c != current_char) {
369 pc = code_base + Load32Aligned(pc + 4);
371 pc += BC_CHECK_NOT_CHAR_LENGTH;
375 BYTECODE(AND_CHECK_4_CHARS) {
376 uint32_t c = Load32Aligned(pc + 4);
377 if (c == (current_char & Load32Aligned(pc + 8))) {
378 pc = code_base + Load32Aligned(pc + 12);
380 pc += BC_AND_CHECK_4_CHARS_LENGTH;
384 BYTECODE(AND_CHECK_CHAR) {
385 uint32_t c = (insn >> BYTECODE_SHIFT);
386 if (c == (current_char & Load32Aligned(pc + 4))) {
387 pc = code_base + Load32Aligned(pc + 8);
389 pc += BC_AND_CHECK_CHAR_LENGTH;
393 BYTECODE(AND_CHECK_NOT_4_CHARS) {
394 uint32_t c = Load32Aligned(pc + 4);
395 if (c != (current_char & Load32Aligned(pc + 8))) {
396 pc = code_base + Load32Aligned(pc + 12);
398 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
402 BYTECODE(AND_CHECK_NOT_CHAR) {
403 uint32_t c = (insn >> BYTECODE_SHIFT);
404 if (c != (current_char & Load32Aligned(pc + 4))) {
405 pc = code_base + Load32Aligned(pc + 8);
407 pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
411 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
412 uint32_t c = (insn >> BYTECODE_SHIFT);
413 uint32_t minus = Load16Aligned(pc + 4);
414 uint32_t mask = Load16Aligned(pc + 6);
415 if (c != ((current_char - minus) & mask)) {
416 pc = code_base + Load32Aligned(pc + 8);
418 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
422 BYTECODE(CHECK_CHAR_IN_RANGE) {
423 uint32_t from = Load16Aligned(pc + 4);
424 uint32_t to = Load16Aligned(pc + 6);
425 if (from <= current_char && current_char <= to) {
426 pc = code_base + Load32Aligned(pc + 8);
428 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
432 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
433 uint32_t from = Load16Aligned(pc + 4);
434 uint32_t to = Load16Aligned(pc + 6);
435 if (from > current_char || current_char > to) {
436 pc = code_base + Load32Aligned(pc + 8);
438 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
442 BYTECODE(CHECK_BIT_IN_TABLE) {
443 int mask = RegExpMacroAssembler::kTableMask;
444 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
445 int bit = (current_char & (kBitsPerByte - 1));
446 if ((b & (1 << bit)) != 0) {
447 pc = code_base + Load32Aligned(pc + 4);
449 pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
454 uint32_t limit = (insn >> BYTECODE_SHIFT);
455 if (current_char < limit) {
456 pc = code_base + Load32Aligned(pc + 4);
458 pc += BC_CHECK_LT_LENGTH;
463 uint32_t limit = (insn >> BYTECODE_SHIFT);
464 if (current_char > limit) {
465 pc = code_base + Load32Aligned(pc + 4);
467 pc += BC_CHECK_GT_LENGTH;
471 BYTECODE(CHECK_REGISTER_LT)
472 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
473 pc = code_base + Load32Aligned(pc + 8);
475 pc += BC_CHECK_REGISTER_LT_LENGTH;
478 BYTECODE(CHECK_REGISTER_GE)
479 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
480 pc = code_base + Load32Aligned(pc + 8);
482 pc += BC_CHECK_REGISTER_GE_LENGTH;
485 BYTECODE(CHECK_REGISTER_EQ_POS)
486 if (registers[insn >> BYTECODE_SHIFT] == current) {
487 pc = code_base + Load32Aligned(pc + 4);
489 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
492 BYTECODE(CHECK_NOT_REGS_EQUAL)
493 if (registers[insn >> BYTECODE_SHIFT] ==
494 registers[Load32Aligned(pc + 4)]) {
495 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
497 pc = code_base + Load32Aligned(pc + 8);
500 BYTECODE(CHECK_NOT_BACK_REF) {
501 int from = registers[insn >> BYTECODE_SHIFT];
502 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
503 if (from < 0 || len <= 0) {
504 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
507 if (current + len > subject.length()) {
508 pc = code_base + Load32Aligned(pc + 4);
512 for (i = 0; i < len; i++) {
513 if (subject[from + i] != subject[current + i]) {
514 pc = code_base + Load32Aligned(pc + 4);
521 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
524 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
525 int from = registers[insn >> BYTECODE_SHIFT];
526 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
527 if (from < 0 || len <= 0) {
528 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
531 if (current + len > subject.length()) {
532 pc = code_base + Load32Aligned(pc + 4);
535 if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
536 from, current, len, subject)) {
538 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
540 pc = code_base + Load32Aligned(pc + 4);
545 BYTECODE(CHECK_AT_START)
547 pc = code_base + Load32Aligned(pc + 4);
549 pc += BC_CHECK_AT_START_LENGTH;
552 BYTECODE(CHECK_NOT_AT_START)
554 pc += BC_CHECK_NOT_AT_START_LENGTH;
556 pc = code_base + Load32Aligned(pc + 4);
559 BYTECODE(SET_CURRENT_POSITION_FROM_END) {
560 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
561 if (subject.length() - current > by) {
562 current = subject.length() - by;
563 current_char = subject[current - 1];
565 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
576 RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
578 Handle<ByteArray> code_array,
579 Handle<String> subject,
581 int start_position) {
582 ASSERT(subject->IsFlat());
584 DisallowHeapAllocation no_gc;
585 const byte* code_base = code_array->GetDataStartAddress();
586 uc16 previous_char = '\n';
587 String::FlatContent subject_content = subject->GetFlatContent();
588 if (subject_content.IsAscii()) {
589 Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
590 if (start_position != 0) previous_char = subject_vector[start_position - 1];
591 return RawMatch(isolate,
598 ASSERT(subject_content.IsTwoByte());
599 Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
600 if (start_position != 0) previous_char = subject_vector[start_position - 1];
601 return RawMatch(isolate,
610 } } // namespace v8::internal