From: christian.plesner.hansen@gmail.com Date: Tue, 25 Nov 2008 11:07:48 +0000 (+0000) Subject: Merge regexp2000 back into bleeding_edge X-Git-Tag: upstream/4.7.83~24973 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b57b4a15cd89672d8f8102a9c586768605dc7524;p=platform%2Fupstream%2Fv8.git Merge regexp2000 back into bleeding_edge Review URL: http://codereview.chromium.org/12427 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@832 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- diff --git a/src/SConscript b/src/SConscript index 8226a8d..f12f301 100644 --- a/src/SConscript +++ b/src/SConscript @@ -35,15 +35,17 @@ Import('context') SOURCES = { 'all': [ - 'accessors.cc', 'allocation.cc', 'api.cc', 'assembler.cc', 'ast.cc', - 'bootstrapper.cc', 'builtins.cc', 'checks.cc', 'code-stubs.cc', - 'codegen.cc', 'compilation-cache.cc', 'compiler.cc', 'contexts.cc', - 'conversions.cc', 'counters.cc', 'dateparser.cc', 'debug.cc', - 'disassembler.cc', 'execution.cc', 'factory.cc', 'flags.cc', 'frames.cc', - 'global-handles.cc', 'handles.cc', 'hashmap.cc', 'heap.cc', 'ic.cc', - 'jsregexp.cc', 'log.cc', 'mark-compact.cc', 'messages.cc', 'objects.cc', - 'parser.cc', 'property.cc', 'rewriter.cc', 'runtime.cc', 'scanner.cc', - 'scopeinfo.cc', 'scopes.cc', 'serialize.cc', 'snapshot-common.cc', + 'accessors.cc', 'allocation.cc', 'api.cc', 'assembler.cc', + 'assembler-irregexp.cc', 'ast.cc', 'bootstrapper.cc', 'builtins.cc', + 'checks.cc', 'code-stubs.cc', 'codegen.cc', 'compilation-cache.cc', + 'compiler.cc', 'contexts.cc', 'conversions.cc', 'counters.cc', + 'dateparser.cc', 'debug.cc', 'disassembler.cc', 'execution.cc', + 'factory.cc', 'flags.cc', 'frames.cc', 'global-handles.cc', + 'handles.cc', 'hashmap.cc', 'heap.cc', 'ic.cc', 'interpreter-irregexp.cc', + 'jsregexp.cc', 'log.cc', 'mark-compact.cc', 'messages.cc', + 'objects.cc', 'parser.cc', 'property.cc', 'regexp-macro-assembler.cc', + 'regexp-macro-assembler-irregexp.cc', 'rewriter.cc', 'runtime.cc', 
'scanner.cc', + 'scopeinfo.cc', 'scopes.cc', 'serialize.cc', 'snapshot-common.cc', 'spaces.cc', 'string-stream.cc', 'stub-cache.cc', 'token.cc', 'top.cc', 'unicode.cc', 'usage-analyzer.cc', 'utils.cc', 'v8-counters.cc', 'v8.cc', 'v8threads.cc', 'variables.cc', 'zone.cc' @@ -53,7 +55,8 @@ SOURCES = { 'macro-assembler-arm.cc', 'stub-cache-arm.cc'], 'arch:ia32': ['assembler-ia32.cc', 'builtins-ia32.cc', 'codegen-ia32.cc', 'cpu-ia32.cc', 'disasm-ia32.cc', 'frames-ia32.cc', 'ic-ia32.cc', - 'macro-assembler-ia32.cc', 'stub-cache-ia32.cc'], + 'macro-assembler-ia32.cc', 'regexp-macro-assembler-ia32.cc', + 'stub-cache-ia32.cc'], 'simulator:arm': ['simulator-arm.cc'], 'os:freebsd': ['platform-freebsd.cc'], 'os:linux': ['platform-linux.cc'], diff --git a/src/assembler-ia32-inl.h b/src/assembler-ia32-inl.h index 9b3567a..534d57e 100644 --- a/src/assembler-ia32-inl.h +++ b/src/assembler-ia32-inl.h @@ -205,6 +205,14 @@ void Assembler::emit(const Immediate& x) { } +void Assembler::emit_w(const Immediate& x) { + ASSERT(x.rmode_ == RelocInfo::NONE); + uint16_t value = static_cast(x.x_); + reinterpret_cast(pc_)[0] = value; + pc_ += sizeof(uint16_t); +} + + Address Assembler::target_address_at(Address pc) { return pc + sizeof(int32_t) + *reinterpret_cast(pc); } diff --git a/src/assembler-ia32.cc b/src/assembler-ia32.cc index 264ef42..1555ecc 100644 --- a/src/assembler-ia32.cc +++ b/src/assembler-ia32.cc @@ -122,7 +122,8 @@ void CpuFeatures::Probe() { #undef __ CodeDesc desc; assm.GetCode(&desc); - Object* code = Heap::CreateCode(desc, NULL, Code::ComputeFlags(Code::STUB)); + Object* code = + Heap::CreateCode(desc, NULL, Code::ComputeFlags(Code::STUB), NULL); if (!code->IsCode()) return; F0 f = FUNCTION_CAST(Code::cast(code)->entry()); uint32_t res = f(); @@ -294,7 +295,6 @@ Assembler::Assembler(void* buffer, int buffer_size) { } buffer_size_ = buffer_size; own_buffer_ = true; - } else { // use externally provided buffer instead ASSERT(buffer_size > 0); @@ -420,6 +420,29 @@ void 
Assembler::push(const Operand& src) {
 }
 
 
+void Assembler::push(Label* label, RelocInfo::Mode reloc_mode) {
+  ASSERT_NOT_NULL(label);
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  // If reloc_mode == NONE, the label is stored as buffer relative.
+  ASSERT(reloc_mode == RelocInfo::NONE);
+  if (label->is_bound()) {
+    // Index of position in Code object:
+    int pos = label->pos() + Code::kHeaderSize;
+    if (pos >= 0 && pos < 128) {  // PUSH imm8 sign-extends its operand.
+      EMIT(0x6a);  // PUSH imm8
+      EMIT(pos);
+    } else {
+      EMIT(0x68);  // PUSH imm32
+      emit(pos);
+    }
+  } else {
+    EMIT(0x68);  // PUSH imm32; the operand is patched in bind_to().
+    emit_disp(label, Displacement::CODE_RELATIVE);
+  }
+}
+
+
 void Assembler::pop(Register dst) {
   ASSERT(reloc_info_writer.last_pc() != NULL);
   if (FLAG_push_pop_elimination && (reloc_info_writer.last_pc() <= last_pc_)) {
@@ -546,6 +569,22 @@ void Assembler::pop(const Operand& dst) {
 }
 
 
+void Assembler::enter(const Immediate& size) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0xC8);  // ENTER imm16, imm8
+  emit_w(size);  // imm16: size of the stack frame.
+  EMIT(0);  // imm8: nesting level 0.
+}
+
+
+void Assembler::leave() {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0xC9);  // LEAVE
+}
+
+
 void Assembler::mov_b(Register dst, const Operand& src) {
   EnsureSpace ensure_space(this);
   last_pc_ = pc_;
@@ -830,6 +869,23 @@ void Assembler::cmp(const Operand& op, const Immediate& imm) {
 }
 
 
+void Assembler::rep_cmpsb() {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0xFC);  // CLD to ensure forward operation
+  EMIT(0xF3);  // REP (acts as REPE when used with CMPS)
+  EMIT(0xA6);  // CMPSB
+}
+
+void Assembler::rep_cmpsw() {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0xFC);  // CLD to ensure forward operation
+  EMIT(0xF3);  // REP (acts as REPE when used with CMPS)
+  EMIT(0xA7);  // CMPSW
+}
+
+
 void Assembler::dec_b(Register dst) {
   EnsureSpace ensure_space(this);
   last_pc_ = pc_;
@@ -1074,6 +1130,14 @@ void Assembler::shr(Register dst) {
 }
 
 
+void Assembler::shr_cl(Register dst) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0xD3);  // SHR r/m32, CL (0xD1 would shift by one instead).
+  EMIT(0xE8 | dst.code());  // ModRM: mod = 11, /5 (SHR), rm = dst.
+}
+
+
 void Assembler::sub(const Operand& dst, const Immediate& x) {
   EnsureSpace ensure_space(this);
last_pc_ = pc_; @@ -1171,6 +1235,15 @@ void Assembler::xor_(const Operand& dst, const Immediate& x) { } +void Assembler::bt(const Operand& dst, Register src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + EMIT(0x0F); + EMIT(0xA3); + emit_operand(src, dst); +} + + void Assembler::bts(const Operand& dst, Register src) { EnsureSpace ensure_space(this); last_pc_ = pc_; @@ -1224,13 +1297,6 @@ void Assembler::ret(int imm16) { } -void Assembler::leave() { - EnsureSpace ensure_space(this); - last_pc_ = pc_; - EMIT(0xC9); -} - - // Labels refer to positions in the (to be) generated code. // There are bound, linked, and unused labels. // @@ -1270,12 +1336,16 @@ void Assembler::bind_to(Label* L, int pos) { while (L->is_linked()) { Displacement disp = disp_at(L); int fixup_pos = L->pos(); - if (disp.type() == Displacement::UNCONDITIONAL_JUMP) { - ASSERT(byte_at(fixup_pos - 1) == 0xE9); // jmp expected + if (disp.type() == Displacement::CODE_RELATIVE) { + long_at_put(fixup_pos, pos + Code::kHeaderSize); + } else { + if (disp.type() == Displacement::UNCONDITIONAL_JUMP) { + ASSERT(byte_at(fixup_pos - 1) == 0xE9); // jmp expected + } + // relative address, relative to point after address + int imm32 = pos - (fixup_pos + sizeof(int32_t)); + long_at_put(fixup_pos, imm32); } - // relative address, relative to point after address - int imm32 = pos - (fixup_pos + sizeof(int32_t)); - long_at_put(fixup_pos, imm32); disp.next(L); } L->bind_to(pos); diff --git a/src/assembler-ia32.h b/src/assembler-ia32.h index 9647446..3efd2f2 100644 --- a/src/assembler-ia32.h +++ b/src/assembler-ia32.h @@ -118,8 +118,8 @@ enum Condition { not_equal = 5, below_equal = 6, above = 7, - sign = 8, - not_sign = 9, + negative = 8, + positive = 9, parity_even = 10, parity_odd = 11, less = 12, @@ -128,10 +128,12 @@ enum Condition { greater = 15, // aliases + carry = below, + not_carry = above_equal, zero = equal, not_zero = not_equal, - negative = sign, - positive = not_sign + sign = negative, + not_sign 
= positive }; @@ -283,13 +285,14 @@ class Operand BASE_EMBEDDED { // // Displacement _data field layout // -// |31.....1| ......0| +// |31.....2|1......0| // [ next | type | class Displacement BASE_EMBEDDED { public: enum Type { UNCONDITIONAL_JUMP, + CODE_RELATIVE, OTHER }; @@ -313,8 +316,8 @@ class Displacement BASE_EMBEDDED { private: int data_; - class TypeField: public BitField {}; - class NextField: public BitField {}; + class TypeField: public BitField {}; + class NextField: public BitField {}; void init(Label* L, Type type); }; @@ -440,10 +443,14 @@ class Assembler : public Malloced { void push(const Immediate& x); void push(Register src); void push(const Operand& src); + void push(Label* label, RelocInfo::Mode relocation_mode); void pop(Register dst); void pop(const Operand& dst); + void enter(const Immediate& size); + void leave(); + // Moves void mov_b(Register dst, const Operand& src); void mov_b(const Operand& dst, int8_t imm8); @@ -491,6 +498,9 @@ class Assembler : public Malloced { void cmp(Register reg, const Operand& op); void cmp(const Operand& op, const Immediate& imm); + void rep_cmpsb(); + void rep_cmpsw(); + void dec_b(Register dst); void dec(Register dst); @@ -535,6 +545,7 @@ class Assembler : public Malloced { void shr(Register dst, uint8_t imm8); void shr(Register dst); + void shr_cl(Register dst); void sub(const Operand& dst, const Immediate& x); void sub(Register dst, const Operand& src); @@ -550,6 +561,7 @@ class Assembler : public Malloced { void xor_(const Operand& dst, const Immediate& x); // Bit operations. 
+ void bt(const Operand& dst, Register src); void bts(const Operand& dst, Register src); // Miscellaneous @@ -558,7 +570,6 @@ class Assembler : public Malloced { void nop(); void rdtsc(); void ret(int imm16); - void leave(); // Label operations & relative jumps (PPUM Appendix D) // @@ -748,6 +759,7 @@ class Assembler : public Malloced { inline void emit(Handle handle); inline void emit(uint32_t x, RelocInfo::Mode rmode); inline void emit(const Immediate& x); + inline void emit_w(const Immediate& x); // instruction generation void emit_arith_b(int op1, int op2, Register dst, int imm8); diff --git a/src/assembler-irregexp-inl.h b/src/assembler-irregexp-inl.h new file mode 100644 index 0000000..d4faeea --- /dev/null +++ b/src/assembler-irregexp-inl.h @@ -0,0 +1,82 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// A light-weight assembler for the Regexp2000 byte code. + + +#include "v8.h" +#include "ast.h" +#include "bytecodes-irregexp.h" +#include "assembler-irregexp.h" + + +namespace v8 { namespace internal { + + +void IrregexpAssembler::Emit(uint32_t byte) { + ASSERT(pc_ <= buffer_.length()); + if (pc_ == buffer_.length()) { + Expand(); + } + buffer_[pc_++] = byte; +} + + +void IrregexpAssembler::Emit16(uint32_t word) { + ASSERT(pc_ <= buffer_.length()); + if (pc_ + 1 >= buffer_.length()) { + Expand(); + } + Store16(buffer_.start() + pc_, word); + pc_ += 2; +} + + +void IrregexpAssembler::Emit32(uint32_t word) { + ASSERT(pc_ <= buffer_.length()); + if (pc_ + 3 >= buffer_.length()) { + Expand(); + } + Store32(buffer_.start() + pc_, word); + pc_ += 4; +} + + +void IrregexpAssembler::EmitOrLink(Label* l) { + if (l->is_bound()) { + Emit32(l->pos()); + } else { + int pos = 0; + if (l->is_linked()) { + pos = l->pos(); + } + l->link_to(pc_); + Emit32(pos); + } + } + +} } // namespace v8::internal diff --git a/src/assembler-irregexp.cc b/src/assembler-irregexp.cc new file mode 100644 index 0000000..bc3d0ae --- /dev/null +++ b/src/assembler-irregexp.cc @@ -0,0 +1,339 @@ +// Copyright 2008 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// A light-weight assembler for the Irregexp byte code. 
+ + +#include "v8.h" +#include "ast.h" +#include "bytecodes-irregexp.h" +#include "assembler-irregexp.h" + +#include "assembler-irregexp-inl.h" + + +namespace v8 { namespace internal { + + +IrregexpAssembler::IrregexpAssembler(Vector buffer) + : buffer_(buffer), + pc_(0), + own_buffer_(false) { +} + + +IrregexpAssembler::~IrregexpAssembler() { + if (own_buffer_) { + buffer_.Dispose(); + } +} + + +void IrregexpAssembler::PushCurrentPosition(int cp_offset) { + ASSERT(cp_offset >= 0); + Emit(BC_PUSH_CP); + Emit32(cp_offset); +} + + +void IrregexpAssembler::PushBacktrack(Label* l) { + Emit(BC_PUSH_BT); + EmitOrLink(l); +} + + +void IrregexpAssembler::PushRegister(int index) { + ASSERT(index >= 0); + Emit(BC_PUSH_REGISTER); + Emit(index); +} + + +void IrregexpAssembler::WriteCurrentPositionToRegister(int index, + int cp_offset) { + ASSERT(cp_offset >= 0); + ASSERT(index >= 0); + Emit(BC_SET_REGISTER_TO_CP); + Emit(index); + Emit32(cp_offset); +} + + +void IrregexpAssembler::ReadCurrentPositionFromRegister(int index) { + ASSERT(index >= 0); + Emit(BC_SET_CP_TO_REGISTER); + Emit(index); +} + + +void IrregexpAssembler::WriteStackPointerToRegister(int index) { + ASSERT(index >= 0); + Emit(BC_SET_REGISTER_TO_SP); + Emit(index); +} + + +void IrregexpAssembler::ReadStackPointerFromRegister(int index) { + ASSERT(index >= 0); + Emit(BC_SET_SP_TO_REGISTER); + Emit(index); +} + + +void IrregexpAssembler::SetRegister(int index, int value) { + ASSERT(index >= 0); + Emit(BC_SET_REGISTER); + Emit(index); + Emit32(value); +} + + +void IrregexpAssembler::AdvanceRegister(int index, int by) { + ASSERT(index >= 0); + Emit(BC_ADVANCE_REGISTER); + Emit(index); + Emit32(by); +} + + +void IrregexpAssembler::PopCurrentPosition() { + Emit(BC_POP_CP); +} + + +void IrregexpAssembler::PopBacktrack() { + Emit(BC_POP_BT); +} + + +void IrregexpAssembler::PopRegister(int index) { + Emit(BC_POP_REGISTER); + Emit(index); +} + + +void IrregexpAssembler::Fail() { + Emit(BC_FAIL); +} + + +void 
IrregexpAssembler::Break() { + Emit(BC_BREAK); +} + + +void IrregexpAssembler::Succeed() { + Emit(BC_SUCCEED); +} + + +void IrregexpAssembler::Bind(Label* l) { + ASSERT(!l->is_bound()); + if (l->is_linked()) { + int pos = l->pos(); + while (pos != 0) { + int fixup = pos; + pos = Load32(buffer_.start() + fixup); + Store32(buffer_.start() + fixup, pc_); + } + } + l->bind_to(pc_); +} + + +void IrregexpAssembler::AdvanceCP(int cp_offset) { + Emit(BC_ADVANCE_CP); + Emit32(cp_offset); +} + + +void IrregexpAssembler::GoTo(Label* l) { + Emit(BC_GOTO); + EmitOrLink(l); +} + + +void IrregexpAssembler::LoadCurrentChar(int cp_offset, Label* on_end) { + Emit(BC_LOAD_CURRENT_CHAR); + Emit32(cp_offset); + EmitOrLink(on_end); +} + + +void IrregexpAssembler::CheckCharacter(uc16 c, Label* on_match) { + Emit(BC_CHECK_CHAR); + Emit16(c); + EmitOrLink(on_match); +} + + +void IrregexpAssembler::CheckNotCharacter(uc16 c, Label* on_mismatch) { + Emit(BC_CHECK_NOT_CHAR); + Emit16(c); + EmitOrLink(on_mismatch); +} + +void IrregexpAssembler::OrThenCheckNotCharacter(uc16 c, + uc16 mask, + Label* on_mismatch) { + Emit(BC_OR_CHECK_NOT_CHAR); + Emit16(c); + Emit16(mask); + EmitOrLink(on_mismatch); +} + + +void IrregexpAssembler::MinusOrThenCheckNotCharacter(uc16 c, + uc16 mask, + Label* on_mismatch) { + Emit(BC_MINUS_OR_CHECK_NOT_CHAR); + Emit16(c); + Emit16(mask); + EmitOrLink(on_mismatch); +} + + +void IrregexpAssembler::CheckCharacterLT(uc16 limit, Label* on_less) { + Emit(BC_CHECK_LT); + Emit16(limit); + EmitOrLink(on_less); +} + + +void IrregexpAssembler::CheckCharacterGT(uc16 limit, Label* on_greater) { + Emit(BC_CHECK_GT); + Emit16(limit); + EmitOrLink(on_greater); +} + + +void IrregexpAssembler::CheckNotBackReference(int capture_index, + Label* on_mismatch) { + Emit(BC_CHECK_NOT_BACK_REF); + Emit(capture_index); + EmitOrLink(on_mismatch); +} + + +void IrregexpAssembler::CheckRegister(int byte_code, + int reg_index, + uint16_t vs, + Label* on_true) { + Emit(byte_code); + Emit(reg_index); 
+ Emit16(vs); + EmitOrLink(on_true); +} + + +void IrregexpAssembler::CheckRegisterLT(int reg_index, + uint16_t vs, + Label* on_less_than) { + CheckRegister(BC_CHECK_REGISTER_LT, reg_index, vs, on_less_than); +} + + +void IrregexpAssembler::CheckRegisterGE(int reg_index, + uint16_t vs, + Label* on_greater_than_equal) { + CheckRegister(BC_CHECK_REGISTER_GE, reg_index, vs, on_greater_than_equal); +} + + +void IrregexpAssembler::LookupMap1(uc16 start, Label* bit_map, Label* on_zero) { + Emit(BC_LOOKUP_MAP1); + Emit16(start); + EmitOrLink(bit_map); + EmitOrLink(on_zero); +} + + +void IrregexpAssembler::LookupMap2(uc16 start, + Label* half_nibble_map, + const Vector& table) { + Emit(BC_LOOKUP_MAP2); + Emit16(start); + EmitOrLink(half_nibble_map); + ASSERT(table.length() > 0); + ASSERT(table.length() <= 4); + for (int i = 0; i < table.length(); i++) { + EmitOrLink(table[i]); + } +} + + +void IrregexpAssembler::LookupMap8(uc16 start, + Label* byte_map, + const Vector& table) { + Emit(BC_LOOKUP_MAP8); + Emit16(start); + EmitOrLink(byte_map); + ASSERT(table.length() > 0); + ASSERT(table.length() <= 256); + for (int i = 0; i < table.length(); i++) { + EmitOrLink(table[i]); + } +} + + +void IrregexpAssembler::LookupHighMap8(byte start, + Label* byte_map, + const Vector& table) { + Emit(BC_LOOKUP_HI_MAP8); + Emit(start); + EmitOrLink(byte_map); + ASSERT(table.length() > 0); + ASSERT(table.length() <= 256); + for (int i = 0; i < table.length(); i++) { + EmitOrLink(table[i]); + } +} + + +int IrregexpAssembler::length() { + return pc_; +} + + +void IrregexpAssembler::Copy(Address a) { + memcpy(a, buffer_.start(), length()); +} + + +void IrregexpAssembler::Expand() { + bool old_buffer_was_our_own = own_buffer_; + Vector old_buffer = buffer_; + buffer_ = Vector::New(old_buffer.length() * 2); + own_buffer_ = true; + memcpy(buffer_.start(), old_buffer.start(), old_buffer.length()); + if (old_buffer_was_our_own) { + old_buffer.Dispose(); + } +} + + +} } // namespace v8::internal diff 
--git a/src/assembler-irregexp.h b/src/assembler-irregexp.h new file mode 100644 index 0000000..9852521 --- /dev/null +++ b/src/assembler-irregexp.h @@ -0,0 +1,137 @@ +// Copyright 2006-2008 the V8 project authors. All rights reserved. + +// A light-weight assembler for the Irregexp byte code. + +#ifndef V8_ASSEMBLER_IRREGEXP_H_ +#define V8_ASSEMBLER_IRREGEXP_H_ + +namespace v8 { namespace internal { + + +class IrregexpAssembler { + public: + // Create an assembler. Byte codes are emitted into the provided buffer, + // starting from the beginning of the buffer. References to labels that + // are not yet bound are chained through the emitted byte codes and are + // patched when the label is bound (see EmitOrLink and Bind). + // + // The provided buffer is not owned by the assembler. No deallocation of + // the provided buffer is done by the assembler, neither upon destruction + // nor when it is replaced by a larger buffer. + // + // If the buffer is too small, the assembler allocates a new buffer of + // twice the size and copies the byte codes emitted so far into it. Such a + // replacement buffer is owned by the assembler and deallocated when it is + // replaced in turn or upon destruction of the assembler. + explicit IrregexpAssembler(Vector); + ~IrregexpAssembler(); + + // CP = current position in source. + // BT = backtrack label. + + // Stack. 
+ void PushCurrentPosition(int cp_offset = 0); + void PushBacktrack(Label* l); + void PushRegister(int index); + void WriteCurrentPositionToRegister(int index, int cp_offset = 0); + void ReadCurrentPositionFromRegister(int index); + void WriteStackPointerToRegister(int index); + void ReadStackPointerFromRegister(int index); + void SetRegister(int index, int value); + void AdvanceRegister(int index, int by); + + void PopCurrentPosition(); + void PopBacktrack(); + void PopRegister(int index); + + void Fail(); + void Succeed(); + + void Break(); // This instruction will cause a fatal VM error if hit. + + void Bind(Label* l); // Binds an unbound label L to the current code posn. + + void AdvanceCP(int by); + + void GoTo(Label* l); + + // Loads current char into a machine register. Jumps to the label if we + // reached the end of the subject string. Fall through otherwise. + void LoadCurrentChar(int cp_offset, Label* on_end); + + // Checks current char register against a singleton. + void CheckCharacter(uc16 c, Label* on_match); + void CheckNotCharacter(uc16 c, Label* on_mismatch); + void OrThenCheckNotCharacter(uc16 c, uc16 mask, Label* on_mismatch); + void MinusOrThenCheckNotCharacter(uc16 c, uc16 mask, Label* on_mismatch); + + // Used to check current char register against a range. + void CheckCharacterLT(uc16 limit, Label* on_less); + void CheckCharacterGT(uc16 limit, Label* on_greater); + + // Checks current position for a match against a + // previous capture. Advances current position by the length of the capture + // iff it matches. The capture is stored in a given register and the + // register after. If a register contains -1 then the other register + // must always contain -1 and the on_mismatch label will never be called. + void CheckNotBackReference(int capture_index, Label* on_mismatch); + + // Checks a register for strictly-less-than or greater-than-or-equal. 
+ void CheckRegisterLT(int reg_index, uint16_t vs, Label* on_less_than); + void CheckRegisterGE(int reg_index, uint16_t vs, Label* on_greater_equal); + + // Subtracts a 16 bit value from the current character, uses the result to + // look up in a bit array, uses the result of that to decide whether to fall + // through (on 1) or jump to the on_zero label (on 0). + void LookupMap1(uc16 start, Label* bit_map, Label* on_zero); + + // Subtracts a 16 bit value from the current character, uses the result to + // look up in a 2-bit array, uses the result of that to look up in a label + // table and jumps to the label. + void LookupMap2(uc16 start, + Label* half_nibble_map, + const Vector& table); + + // Subtracts a 16 bit value from the current character, uses the result to + // look up in a byte array, uses the result of that to look up in a label + // array and jumps to the label. + void LookupMap8(uc16 start, Label* byte_map, const Vector& table); + + // Takes the high byte of the current character, uses the result to + // look up in a byte array, uses the result of that to look up in a label + // array and jumps to the label. + void LookupHighMap8(byte start, Label* byte_map, const Vector& table); + + // Code and bitmap emission. + inline void Emit32(uint32_t x); + inline void Emit16(uint32_t x); + inline void Emit(uint32_t x); + + // Bytecode buffer. + int length(); + void Copy(Address a); + + inline void EmitOrLink(Label* l); + private: + // Don't use this. + IrregexpAssembler() { UNREACHABLE(); } + // The buffer into which code and relocation info are generated. + Vector buffer_; + + inline void CheckRegister(int byte_code, + int reg_index, + uint16_t vs, + Label* on_true); + // Code generation. + int pc_; // The program counter; moves forward. + + // True if the assembler owns the buffer, false if buffer is external. 
+ bool own_buffer_; + + void Expand(); +}; + + +} } // namespace v8::internal + +#endif // V8_ASSEMBLER_IRREGEXP_H_ diff --git a/src/assembler.h b/src/assembler.h index 81c8056..b613d9b 100644 --- a/src/assembler.h +++ b/src/assembler.h @@ -50,7 +50,8 @@ namespace v8 { namespace internal { class Label : public ZoneObject { // LabelShadows are dynamically allocated. public: - INLINE(Label()) { Unuse(); } + INLINE(Label()) + { Unuse(); } INLINE(~Label()) { ASSERT(!is_linked()); } INLINE(void Unuse()) { pos_ = 0; } @@ -82,8 +83,10 @@ class Label : public ZoneObject { // LabelShadows are dynamically allocated. } friend class Assembler; + friend class RegexpAssembler; friend class Displacement; friend class LabelShadow; + friend class IrregexpAssembler; }; diff --git a/src/ast.cc b/src/ast.cc index 2f21d92..ce4dfb8 100644 --- a/src/ast.cc +++ b/src/ast.cc @@ -29,6 +29,7 @@ #include "ast.h" #include "scopes.h" +#include "string-stream.h" namespace v8 { namespace internal { @@ -179,4 +180,204 @@ void Visitor::VisitExpressions(ZoneList* expressions) { } +// ---------------------------------------------------------------------------- +// Regular expressions + +#define MAKE_ACCEPT(Name) \ + void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \ + return visitor->Visit##Name(this, data); \ + } +FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT) +#undef MAKE_ACCEPT + +#define MAKE_TYPE_CASE(Name) \ + RegExp##Name* RegExpTree::As##Name() { \ + return NULL; \ + } \ + bool RegExpTree::Is##Name() { return false; } + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) +#undef MAKE_TYPE_CASE + +#define MAKE_TYPE_CASE(Name) \ + RegExp##Name* RegExp##Name::As##Name() { \ + return this; \ + } \ + bool RegExp##Name::Is##Name() { return true; } +FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) +#undef MAKE_TYPE_CASE + +RegExpEmpty RegExpEmpty::kInstance; + + +// Convert regular expression trees to a simple sexp representation. 
+// This representation should be different from the input grammar +// in as many cases as possible, to make it more difficult for incorrect +// parses to look as correct ones which is likely if the input and +// output formats are alike. +class RegExpUnparser: public RegExpVisitor { + public: + RegExpUnparser(); + void VisitCharacterRange(CharacterRange that); + SmartPointer ToString() { return stream_.ToCString(); } +#define MAKE_CASE(Name) virtual void* Visit##Name(RegExp##Name*, void* data); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE + private: + StringStream* stream() { return &stream_; } + HeapStringAllocator alloc_; + StringStream stream_; +}; + + +RegExpUnparser::RegExpUnparser() : stream_(&alloc_) { +} + + +void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) { + stream()->Add("(|"); + for (int i = 0; i < that->alternatives()->length(); i++) { + stream()->Add(" "); + that->alternatives()->at(i)->Accept(this, data); + } + stream()->Add(")"); + return NULL; +} + + +void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) { + stream()->Add("(:"); + for (int i = 0; i < that->nodes()->length(); i++) { + stream()->Add(" "); + that->nodes()->at(i)->Accept(this, data); + } + stream()->Add(")"); + return NULL; +} + + +void RegExpUnparser::VisitCharacterRange(CharacterRange that) { + stream()->Add("%k", that.from()); + if (!that.IsSingleton()) { + stream()->Add("-%k", that.to()); + } +} + + + +void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that, + void* data) { + if (that->is_negated()) + stream()->Add("^"); + stream()->Add("["); + for (int i = 0; i < that->ranges()->length(); i++) { + if (i > 0) stream()->Add(" "); + VisitCharacterRange(that->ranges()->at(i)); + } + stream()->Add("]"); + return NULL; +} + + +void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { + switch (that->type()) { + case RegExpAssertion::START_OF_INPUT: + stream()->Add("@^i"); + break; + case 
RegExpAssertion::END_OF_INPUT: + stream()->Add("@$i"); + break; + case RegExpAssertion::START_OF_LINE: + stream()->Add("@^l"); + break; + case RegExpAssertion::END_OF_LINE: + stream()->Add("@$l"); + break; + case RegExpAssertion::BOUNDARY: + stream()->Add("@b"); + break; + case RegExpAssertion::NON_BOUNDARY: + stream()->Add("@B"); + break; + } + return NULL; +} + + +void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) { + stream()->Add("'"); + Vector chardata = that->data(); + for (int i = 0; i < chardata.length(); i++) { + stream()->Add("%k", chardata[i]); + } + stream()->Add("'"); + return NULL; +} + + +void* RegExpUnparser::VisitText(RegExpText* that, void* data) { + if (that->elements()->length() == 1) { + that->elements()->at(0).data.u_atom->Accept(this, data); + } else { + stream()->Add("(!"); + for (int i = 0; i < that->elements()->length(); i++) { + stream()->Add(" "); + that->elements()->at(i).data.u_atom->Accept(this, data); + } + stream()->Add(")"); + } + return NULL; +} + + +void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) { + stream()->Add("(# %i ", that->min()); + if (that->max() == RegExpQuantifier::kInfinity) { + stream()->Add("- "); + } else { + stream()->Add("%i ", that->max()); + } + stream()->Add(that->is_greedy() ? "g " : "n "); + that->body()->Accept(this, data); + stream()->Add(")"); + return NULL; +} + + +void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) { + stream()->Add("(^ "); + that->body()->Accept(this, data); + stream()->Add(")"); + return NULL; +} + + +void* RegExpUnparser::VisitLookahead(RegExpLookahead* that, void* data) { + stream()->Add("(-> "); + stream()->Add(that->is_positive() ? 
"+ " : "- "); + that->body()->Accept(this, data); + stream()->Add(")"); + return NULL; +} + + +void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, + void* data) { + stream()->Add("(<- %i)", that->index()); + return NULL; +} + + +void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) { + stream()->Put('%'); + return NULL; +} + + +SmartPointer RegExpTree::ToString() { + RegExpUnparser unparser; + Accept(&unparser, NULL); + return unparser.ToString(); +} + + } } // namespace v8::internal diff --git a/src/ast.h b/src/ast.h index b383f76..f3a03eb 100644 --- a/src/ast.h +++ b/src/ast.h @@ -34,6 +34,7 @@ #include "token.h" #include "variables.h" #include "macro-assembler.h" +#include "jsregexp.h" namespace v8 { namespace internal { @@ -1192,6 +1193,268 @@ class ThisFunction: public Expression { // ---------------------------------------------------------------------------- +// Regular expressions + + +class RegExpTree: public ZoneObject { + public: + virtual ~RegExpTree() { } + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) = 0; + virtual bool IsTextElement() { return false; } + virtual void AppendToText(RegExpText* text); + SmartPointer ToString(); +#define MAKE_ASTYPE(Name) \ + virtual RegExp##Name* As##Name(); \ + virtual bool Is##Name(); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) +#undef MAKE_ASTYPE +}; + + +class RegExpDisjunction: public RegExpTree { + public: + explicit RegExpDisjunction(ZoneList* alternatives) + : alternatives_(alternatives) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpDisjunction* AsDisjunction(); + virtual bool IsDisjunction(); + ZoneList* alternatives() { return alternatives_; } + private: + ZoneList* alternatives_; +}; + + +class RegExpAlternative: public 
RegExpTree { + public: + explicit RegExpAlternative(ZoneList* nodes) : nodes_(nodes) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpAlternative* AsAlternative(); + virtual bool IsAlternative(); + ZoneList* nodes() { return nodes_; } + private: + ZoneList* nodes_; +}; + + +class RegExpText: public RegExpTree { + public: + RegExpText() : elements_(2) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpText* AsText(); + virtual bool IsText(); + virtual bool IsTextElement() { return true; } + virtual void AppendToText(RegExpText* text); + void AddElement(TextElement elm) { elements_.Add(elm); } + ZoneList* elements() { return &elements_; } + private: + ZoneList elements_; +}; + + +class RegExpAssertion: public RegExpTree { + public: + enum Type { + START_OF_LINE, START_OF_INPUT, END_OF_LINE, END_OF_INPUT, + BOUNDARY, NON_BOUNDARY + }; + explicit RegExpAssertion(Type type) : type_(type) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpAssertion* AsAssertion(); + virtual bool IsAssertion(); + Type type() { return type_; } + private: + Type type_; +}; + + +class RegExpCharacterClass: public RegExpTree { + public: + RegExpCharacterClass(ZoneList* ranges, bool is_negated) + : ranges_(ranges), + is_negated_(is_negated) { } + explicit RegExpCharacterClass(uc16 type) + : ranges_(new ZoneList(2)), + is_negated_(false) { + CharacterRange::AddClassEscape(type, ranges_); + } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpCharacterClass* 
AsCharacterClass(); + virtual bool IsCharacterClass(); + virtual bool IsTextElement() { return true; } + virtual void AppendToText(RegExpText* text); + ZoneList* ranges() { return ranges_; } + bool is_negated() { return is_negated_; } + private: + ZoneList* ranges_; + bool is_negated_; +}; + + +class RegExpAtom: public RegExpTree { + public: + explicit RegExpAtom(Vector data) : data_(data) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpAtom* AsAtom(); + virtual bool IsAtom(); + virtual bool IsTextElement() { return true; } + virtual void AppendToText(RegExpText* text); + Vector data() { return data_; } + private: + Vector data_; +}; + + +class RegExpQuantifier: public RegExpTree { + public: + RegExpQuantifier(int min, int max, bool is_greedy, RegExpTree* body) + : min_(min), + max_(max), + is_greedy_(is_greedy), + body_(body) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + static RegExpNode* ToNode(int min, + int max, + bool is_greedy, + RegExpTree* body, + RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpQuantifier* AsQuantifier(); + virtual bool IsQuantifier(); + int min() { return min_; } + int max() { return max_; } + bool is_greedy() { return is_greedy_; } + RegExpTree* body() { return body_; } + // We just use a very large integer value as infinity because 2^30 + // is infinite in practice. 
+ static const int kInfinity = (1 << 30); + private: + int min_; + int max_; + bool is_greedy_; + RegExpTree* body_; +}; + + +enum CaptureAvailability { + CAPTURE_AVAILABLE, CAPTURE_UNREACHABLE, CAPTURE_PERMANENTLY_UNREACHABLE }; + +class RegExpCapture: public RegExpTree { + public: + explicit RegExpCapture(RegExpTree* body, int index) + : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + static RegExpNode* ToNode(RegExpTree* body, + int index, + RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpCapture* AsCapture(); + virtual bool IsCapture(); + RegExpTree* body() { return body_; } + int index() { return index_; } + inline CaptureAvailability available() { return available_; } + inline void set_available(CaptureAvailability availability) { + available_ = availability; + } + static int StartRegister(int index) { return index * 2; } + static int EndRegister(int index) { return index * 2 + 1; } + private: + RegExpTree* body_; + int index_; + CaptureAvailability available_; +}; + + +class RegExpLookahead: public RegExpTree { + public: + RegExpLookahead(RegExpTree* body, bool is_positive) + : body_(body), + is_positive_(is_positive) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpLookahead* AsLookahead(); + virtual bool IsLookahead(); + RegExpTree* body() { return body_; } + bool is_positive() { return is_positive_; } + private: + RegExpTree* body_; + bool is_positive_; +}; + + +class RegExpBackReference: public RegExpTree { + public: + explicit RegExpBackReference(RegExpCapture* capture) + : capture_(capture) { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* 
ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpBackReference* AsBackReference(); + virtual bool IsBackReference(); + int index() { return capture_->index(); } + RegExpCapture* capture() { return capture_; } + private: + RegExpCapture* capture_; +}; + + +class RegExpEmpty: public RegExpTree { + public: + RegExpEmpty() { } + virtual void* Accept(RegExpVisitor* visitor, void* data); + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure); + virtual RegExpEmpty* AsEmpty(); + virtual bool IsEmpty(); + static RegExpEmpty* GetInstance() { return &kInstance; } + private: + static RegExpEmpty kInstance; +}; + + +class RegExpVisitor BASE_EMBEDDED { + public: + virtual ~RegExpVisitor() { } +#define MAKE_CASE(Name) \ + virtual void* Visit##Name(RegExp##Name*, void* data) = 0; + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE +}; + + +// ---------------------------------------------------------------------------- // Basic visitor // - leaf node visitors are abstract. diff --git a/src/builtins.cc b/src/builtins.cc index fdff2d2..c44b2e2 100644 --- a/src/builtins.cc +++ b/src/builtins.cc @@ -647,7 +647,7 @@ void Builtins::Setup(bool create_heap_objects) { // During startup it's OK to always allocate and defer GC to later. // This simplifies things because we don't need to retry. AlwaysAllocateScope __scope__; - code = Heap::CreateCode(desc, NULL, flags); + code = Heap::CreateCode(desc, NULL, flags, NULL); if (code->IsFailure()) { v8::internal::V8::FatalProcessOutOfMemory("CreateCode"); } diff --git a/src/bytecodes-irregexp.h b/src/bytecodes-irregexp.h new file mode 100644 index 0000000..9469c04 --- /dev/null +++ b/src/bytecodes-irregexp.h @@ -0,0 +1,78 @@ +// Copyright 2008 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#ifndef V8_BYTECODES_IRREGEXP_H_ +#define V8_BYTECODES_IRREGEXP_H_ + +namespace v8 { namespace internal { + +#define BYTECODE_ITERATOR(V) \ +V(BREAK, 0, 1) /* break */ \ +V(PUSH_CP, 1, 5) /* push_cp offset32 */ \ +V(PUSH_BT, 2, 5) /* push_bt addr32 */ \ +V(PUSH_REGISTER, 3, 2) /* push_register register_index */ \ +V(SET_REGISTER_TO_CP, 4, 6) /* set_register_to_cp register_index offset32 */ \ +V(SET_CP_TO_REGISTER, 5, 2) /* set_cp_to_register register_index */ \ +V(SET_REGISTER_TO_SP, 6, 2) /* set_register_to_sp register_index */ \ +V(SET_SP_TO_REGISTER, 7, 2) /* set_sp_to_register register_index */ \ +V(SET_REGISTER, 8, 6) /* set_register register_index value32 */ \ +V(ADVANCE_REGISTER, 9, 6) /* advance_register register_index value32 */ \ +V(POP_CP, 10, 1) /* pop_cp */ \ +V(POP_BT, 11, 1) /* pop_bt */ \ +V(POP_REGISTER, 12, 2) /* pop_register register_index */ \ +V(FAIL, 13, 1) /* fail */ \ +V(SUCCEED, 14, 1) /* succeed */ \ +V(ADVANCE_CP, 15, 5) /* advance_cp offset32 */ \ +V(GOTO, 16, 5) /* goto addr32 */ \ +V(LOAD_CURRENT_CHAR, 17, 9) /* load offset32 addr32 */ \ +V(CHECK_CHAR, 18, 7) /* check_char uc16 addr32 */ \ +V(CHECK_NOT_CHAR, 19, 7) /* check_not_char uc16 addr32 */ \ +V(OR_CHECK_NOT_CHAR, 20, 9) /* or_check_not_char uc16 uc16 addr32 */ \ +V(MINUS_OR_CHECK_NOT_CHAR, 21, 9) /* minus_or_check_not_char uc16 uc16 ad...*/ \ +V(CHECK_LT, 22, 7) /* check_lt uc16 addr32 */ \ +V(CHECK_GT, 23, 7) /* check_gt uc16 addr32 */ \ +V(CHECK_NOT_BACK_REF, 24, 6) /* check_not_back_ref capture_idx addr32 */ \ +V(LOOKUP_MAP1, 25, 11) /* l_map1 start16 bit_map_addr32 addr32 */ \ +V(LOOKUP_MAP2, 26, 99) /* l_map2 start16 half_nibble_map_addr32* */ \ +V(LOOKUP_MAP8, 27, 99) /* l_map8 start16 byte_map addr32* */ \ +V(LOOKUP_HI_MAP8, 28, 99) /* l_himap8 start8 byte_map_addr32 addr32* */ \ +V(CHECK_REGISTER_LT, 29, 8) /* check_reg_lt register_index value16 addr32 */ \ +V(CHECK_REGISTER_GE, 30, 8) /* check_reg_ge register_index value16 addr32 */ \ + +#define
DECLARE_BYTECODES(name, code, length) \ + static const int BC_##name = code; +BYTECODE_ITERATOR(DECLARE_BYTECODES) +#undef DECLARE_BYTECODES + +#define DECLARE_BYTECODE_LENGTH(name, code, length) \ + static const int BC_##name##_LENGTH = length; +BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH) +#undef DECLARE_BYTECODE_LENGTH +} } + +#endif // V8_BYTECODES_IRREGEXP_H_ diff --git a/src/checks.h b/src/checks.h index 13075f8..77c43bc 100644 --- a/src/checks.h +++ b/src/checks.h @@ -237,12 +237,14 @@ template class StaticAssertionHelper { }; // The ASSERT macro is equivalent to CHECK except that it only // generates code in debug builds. Ditto STATIC_ASSERT. #ifdef DEBUG +#define ASSERT_RESULT(expr) CHECK(expr) #define ASSERT(condition) CHECK(condition) #define ASSERT_EQ(v1, v2) CHECK_EQ(v1, v2) #define ASSERT_NE(v1, v2) CHECK_NE(v1, v2) #define STATIC_ASSERT(test) STATIC_CHECK(test) #define SLOW_ASSERT(condition) if (FLAG_enable_slow_asserts) CHECK(condition) #else +#define ASSERT_RESULT(expr) (expr) #define ASSERT(condition) ((void) 0) #define ASSERT_EQ(v1, v2) ((void) 0) #define ASSERT_NE(v1, v2) ((void) 0) @@ -256,4 +258,6 @@ template class StaticAssertionHelper { }; #define ASSERT_SIZE_TAG_ALIGNED(size) ASSERT((size & kHeapObjectTagMask) == 0) +#define ASSERT_NOT_NULL(p) ASSERT_NE(NULL, p) + #endif // V8_CHECKS_H_ diff --git a/src/constants-arm.h b/src/constants-arm.h index c74708b..46ddb15 100644 --- a/src/constants-arm.h +++ b/src/constants-arm.h @@ -120,7 +120,7 @@ typedef int32_t instr_t; // bits. 
// // bool InstructionSetsConditionCodes(byte* ptr) { -// Instr *instr = Instr::At(ptr); +// Instr* instr = Instr::At(ptr); // int type = instr->TypeField(); // return ((type == 0) || (type == 1)) && instr->HasS(); // } diff --git a/src/factory.cc b/src/factory.cc index 209cd4d..2c82b1d 100644 --- a/src/factory.cc +++ b/src/factory.cc @@ -170,9 +170,9 @@ Handle Factory::NewProxy(const AccessorDescriptor* desc) { } -Handle Factory::NewByteArray(int length) { +Handle Factory::NewByteArray(int length, PretenureFlag pretenure) { ASSERT(0 <= length); - CALL_HEAP_FUNCTION(Heap::AllocateByteArray(length), ByteArray); + CALL_HEAP_FUNCTION(Heap::AllocateByteArray(length, pretenure), ByteArray); } @@ -458,8 +458,14 @@ Handle Factory::NewFunctionWithPrototype(Handle name, Handle Factory::NewCode(const CodeDesc& desc, ScopeInfo<>* sinfo, + Code::Flags flags, Handle self_ref) { + CALL_HEAP_FUNCTION(Heap::CreateCode( + desc, sinfo, flags, reinterpret_cast(self_ref.location())), Code); +} + +Handle Factory::NewCode(const CodeDesc& desc, ScopeInfo<>* sinfo, Code::Flags flags) { - CALL_HEAP_FUNCTION(Heap::CreateCode(desc, sinfo, flags), Code); + CALL_HEAP_FUNCTION(Heap::CreateCode(desc, sinfo, flags, NULL), Code); } @@ -706,8 +712,11 @@ Handle Factory::CreateApiFunction( ASSERT(type != INVALID_TYPE); Handle result = - Factory::NewFunction(Factory::empty_symbol(), type, instance_size, - code, true); + Factory::NewFunction(Factory::empty_symbol(), + type, + instance_size, + code, + true); // Set class name. Handle class_name = Handle(obj->class_name()); if (class_name->IsString()) { diff --git a/src/factory.h b/src/factory.h index 89e2d69..429c483 100644 --- a/src/factory.h +++ b/src/factory.h @@ -147,7 +147,8 @@ class Factory : public AllStatic { // the old generation). 
static Handle NewProxy(const AccessorDescriptor* proxy); - static Handle NewByteArray(int length); + static Handle NewByteArray(int length, + PretenureFlag pretenure = NOT_TENURED); static Handle NewMap(InstanceType type, int instance_size); @@ -206,6 +207,9 @@ class Factory : public AllStatic { Handle context); static Handle NewCode(const CodeDesc& desc, ScopeInfo<>* sinfo, + Code::Flags flags, Handle self_reference); + + static Handle NewCode(const CodeDesc& desc, ScopeInfo<>* sinfo, Code::Flags flags); static Handle CopyCode(Handle code); diff --git a/src/flag-definitions.h b/src/flag-definitions.h index 2c253db..59926ea 100644 --- a/src/flag-definitions.h +++ b/src/flag-definitions.h @@ -289,6 +289,12 @@ DEFINE_bool(collect_heap_spill_statistics, false, "report heap spill statistics along with heap_stats " "(requires heap_stats)") +DEFINE_bool(irregexp, false, "new regular expression code") +DEFINE_bool(trace_regexps, false, "trace Irregexp execution") +DEFINE_bool(trace_regexp_bytecodes, false, "trace Irregexp bytecode executon") +DEFINE_bool(attempt_case_independent, false, "attempt to run Irregexp case independent") +DEFINE_bool(irregexp_native, false, "use native code Irregexp implementation (IA32 only)") + // // Logging and profiling only flags // diff --git a/src/globals.h b/src/globals.h index 387ed88..69e9006 100644 --- a/src/globals.h +++ b/src/globals.h @@ -178,10 +178,16 @@ class Map; class MapSpace; class MarkCompactCollector; class NewSpace; +class NodeVisitor; class Object; class OldSpace; class Property; class Proxy; +class RegExpNode; +struct RegExpParseResult; +class RegExpTree; +class RegExpCompiler; +class RegExpVisitor; class Scope; template class ScopeInfo; class Script; diff --git a/src/heap.cc b/src/heap.cc index 804e525..53b0774 100644 --- a/src/heap.cc +++ b/src/heap.cc @@ -392,8 +392,7 @@ void Heap::PerformGarbageCollection(AllocationSpace space, } Counters::objs_since_last_young.Set(0); - // Process weak handles post gc. 
- GlobalHandles::PostGarbageCollectionProcessing(); + PostGarbageCollectionProcessing(); if (collector == MARK_COMPACTOR) { // Register the amount of external allocated memory. @@ -408,6 +407,14 @@ void Heap::PerformGarbageCollection(AllocationSpace space, } +void Heap::PostGarbageCollectionProcessing() { + // Process weak handles post gc. + GlobalHandles::PostGarbageCollectionProcessing(); + // Update flat string readers. + FlatStringReader::PostGarbageCollectionProcessing(); +} + + void Heap::MarkCompact(GCTracer* tracer) { gc_state_ = MARK_COMPACT; mc_count_++; @@ -1582,6 +1589,24 @@ Object* Heap::LookupSingleCharacterStringFromCode(uint16_t code) { } +Object* Heap::AllocateByteArray(int length, PretenureFlag pretenure) { + if (pretenure == NOT_TENURED) { + return AllocateByteArray(length); + } + int size = ByteArray::SizeFor(length); + AllocationSpace space = + size > MaxHeapObjectSize() ? LO_SPACE : OLD_DATA_SPACE; + + Object* result = AllocateRaw(size, space, OLD_DATA_SPACE); + + if (result->IsFailure()) return result; + + reinterpret_cast(result)->set_map(byte_array_map()); + reinterpret_cast(result)->set_length(length); + return result; +} + + Object* Heap::AllocateByteArray(int length) { int size = ByteArray::SizeFor(length); AllocationSpace space = @@ -1599,7 +1624,8 @@ Object* Heap::AllocateByteArray(int length) { Object* Heap::CreateCode(const CodeDesc& desc, ScopeInfo<>* sinfo, - Code::Flags flags) { + Code::Flags flags, + Code** self_reference) { // Compute size int body_size = RoundUp(desc.instr_size + desc.reloc_size, kObjectAlignment); int sinfo_size = 0; @@ -1622,7 +1648,16 @@ Object* Heap::CreateCode(const CodeDesc& desc, code->set_sinfo_size(sinfo_size); code->set_flags(flags); code->set_ic_flag(Code::IC_TARGET_IS_ADDRESS); - code->CopyFrom(desc); // migrate generated code + // Allow self references to created code object. + if (self_reference != NULL) { + *self_reference = code; + } + // Migrate generated code. 
+ // The generated code can contain Object** values (typically from handles) + // that are dereferenced during the copy to point directly to the actual heap + // objects. These pointers can include references to the code object itself, + // through the self_reference parameter. + code->CopyFrom(desc); if (sinfo != NULL) sinfo->Serialize(code); // write scope info #ifdef DEBUG diff --git a/src/heap.h b/src/heap.h index 694ee56..bbd679b 100644 --- a/src/heap.h +++ b/src/heap.h @@ -391,7 +391,13 @@ class Heap : public AllStatic { // Allocate a byte array of the specified length // Returns Failure::RetryAfterGC(requested_bytes, space) if the allocation // failed. - // Please not this does not perform a garbage collection. + // Please note this does not perform a garbage collection. + static Object* AllocateByteArray(int length, PretenureFlag pretenure); + + // Allocate a non-tenured byte array of the specified length + // Returns Failure::RetryAfterGC(requested_bytes, space) if the allocation + // failed. + // Please note this does not perform a garbage collection. static Object* AllocateByteArray(int length); // Allocates a fixed array initialized with undefined values @@ -549,11 +555,14 @@ class Heap : public AllStatic { // Makes a new native code object // Returns Failure::RetryAfterGC(requested_bytes, space) if the allocation - // failed. + // failed. On success, the pointer to the Code object is stored in the + // self_reference. This allows generated code to reference its own Code + // object by containing this pointer. // Please note this function does not perform a garbage collection. static Object* CreateCode(const CodeDesc& desc, ScopeInfo<>* sinfo, - Code::Flags flags); + Code::Flags flags, + Code** self_reference = NULL); static Object* CopyCode(Code* code); // Finds the symbol for string in the symbol table. 
@@ -582,6 +591,9 @@ class Heap : public AllStatic { static void GarbageCollectionPrologue(); static void GarbageCollectionEpilogue(); + // Code that should be executed after the garbage collection proper. + static void PostGarbageCollectionProcessing(); + // Performs garbage collection operation. // Returns whether required_space bytes are available after the collection. static bool CollectGarbage(int required_space, AllocationSpace space); diff --git a/src/interpreter-irregexp.cc b/src/interpreter-irregexp.cc new file mode 100644 index 0000000..7c915eb --- /dev/null +++ b/src/interpreter-irregexp.cc @@ -0,0 +1,347 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// A simple interpreter for the Irregexp byte code. + + +#include "v8.h" +#include "utils.h" +#include "ast.h" +#include "bytecodes-irregexp.h" +#include "interpreter-irregexp.h" + + +namespace v8 { namespace internal { + +// Debug-only: prints pc offset, stack depth, position and operand bytes. +#ifdef DEBUG +static void TraceInterpreter(const byte* code_base, + const byte* pc, + int stack_depth, + int current_position, + int bytecode_length, + const char* bytecode_name) { + if (FLAG_trace_regexp_bytecodes) { + PrintF("pc = %02x, sp = %d, current = %d, bc = %s", + static_cast<int>(pc - code_base), + stack_depth, + current_position, + bytecode_name); + for (int i = 1; i < bytecode_length; i++) { + PrintF(", %02x", pc[i]); + } + PrintF("\n"); + } +} + + +# define BYTECODE(name) case BC_##name: \ + TraceInterpreter(code_base, \ + pc, \ + backtrack_sp - backtrack_stack, \ + current, \ + BC_##name##_LENGTH, \ + #name); +#else +# define BYTECODE(name) case BC_##name: // NOLINT +#endif + + + +static bool RawMatch(const byte* code_base, + Vector subject, + int* registers, + int current) { + const byte* pc = code_base; + static const int kBacktrackStackSize = 10000; + int backtrack_stack[kBacktrackStackSize]; + int backtrack_stack_space = kBacktrackStackSize; + int* backtrack_sp = backtrack_stack; + int current_char = -1; +#ifdef DEBUG + if (FLAG_trace_regexp_bytecodes) { + PrintF("\n\nStart bytecode interpreter\n\n"); + } +#endif + while (true) { + switch (*pc) { + BYTECODE(BREAK) + UNREACHABLE(); + return false; +
BYTECODE(PUSH_CP) + if (--backtrack_stack_space < 0) { + return false; // No match on backtrack stack overflow. + } + *backtrack_sp++ = current + Load32(pc + 1); + pc += BC_PUSH_CP_LENGTH; + break; + BYTECODE(PUSH_BT) + if (--backtrack_stack_space < 0) { + return false; // No match on backtrack stack overflow. + } + *backtrack_sp++ = Load32(pc + 1); + pc += BC_PUSH_BT_LENGTH; + break; + BYTECODE(PUSH_REGISTER) + if (--backtrack_stack_space < 0) { + return false; // No match on backtrack stack overflow. + } + *backtrack_sp++ = registers[pc[1]]; + pc += BC_PUSH_REGISTER_LENGTH; + break; + BYTECODE(SET_REGISTER) + registers[pc[1]] = Load32(pc + 2); + pc += BC_SET_REGISTER_LENGTH; + break; + BYTECODE(ADVANCE_REGISTER) + registers[pc[1]] += Load32(pc + 2); + pc += BC_ADVANCE_REGISTER_LENGTH; + break; + BYTECODE(SET_REGISTER_TO_CP) + registers[pc[1]] = current + Load32(pc + 2); + pc += BC_SET_REGISTER_TO_CP_LENGTH; + break; + BYTECODE(SET_CP_TO_REGISTER) + current = registers[pc[1]]; + pc += BC_SET_CP_TO_REGISTER_LENGTH; + break; + BYTECODE(SET_REGISTER_TO_SP) + registers[pc[1]] = backtrack_sp - backtrack_stack; + pc += BC_SET_REGISTER_TO_SP_LENGTH; + break; + BYTECODE(SET_SP_TO_REGISTER) + backtrack_sp = backtrack_stack + registers[pc[1]]; + backtrack_stack_space = kBacktrackStackSize - + (backtrack_sp - backtrack_stack); + pc += BC_SET_SP_TO_REGISTER_LENGTH; + break; + BYTECODE(POP_CP) + backtrack_stack_space++; + --backtrack_sp; + current = *backtrack_sp; + pc += BC_POP_CP_LENGTH; + break; + BYTECODE(POP_BT) + backtrack_stack_space++; + --backtrack_sp; + pc = code_base + *backtrack_sp; + break; + BYTECODE(POP_REGISTER) + backtrack_stack_space++; + --backtrack_sp; + registers[pc[1]] = *backtrack_sp; + pc += BC_POP_REGISTER_LENGTH; + break; + BYTECODE(FAIL) + return false; + BYTECODE(SUCCEED) + return true; + BYTECODE(ADVANCE_CP) + current += Load32(pc + 1); + pc += BC_ADVANCE_CP_LENGTH; + break; + BYTECODE(GOTO) + pc = code_base + Load32(pc + 1); + break; + 
BYTECODE(LOAD_CURRENT_CHAR) { + int pos = current + Load32(pc + 1); + if (pos >= subject.length()) { + pc = code_base + Load32(pc + 5); + } else { + current_char = subject[pos]; + pc += BC_LOAD_CURRENT_CHAR_LENGTH; + } + break; + } + BYTECODE(CHECK_CHAR) { + int c = Load16(pc + 1); + if (c == current_char) { + pc = code_base + Load32(pc + 3); + } else { + pc += BC_CHECK_CHAR_LENGTH; + } + break; + } + BYTECODE(CHECK_NOT_CHAR) { + int c = Load16(pc + 1); + if (c != current_char) { + pc = code_base + Load32(pc + 3); + } else { + pc += BC_CHECK_NOT_CHAR_LENGTH; + } + break; + } + BYTECODE(OR_CHECK_NOT_CHAR) { + int c = Load16(pc + 1); + if (c != (current_char | Load16(pc + 3))) { + pc = code_base + Load32(pc + 5); + } else { + pc += BC_OR_CHECK_NOT_CHAR_LENGTH; + } + break; + } + BYTECODE(MINUS_OR_CHECK_NOT_CHAR) { + int c = Load16(pc + 1); + int m = Load16(pc + 3); + if (c != ((current_char - m) | m)) { + pc = code_base + Load32(pc + 5); + } else { + pc += BC_MINUS_OR_CHECK_NOT_CHAR_LENGTH; + } + break; + } + BYTECODE(CHECK_LT) { + int limit = Load16(pc + 1); + if (current_char < limit) { + pc = code_base + Load32(pc + 3); + } else { + pc += BC_CHECK_LT_LENGTH; + } + break; + } + BYTECODE(CHECK_GT) { + int limit = Load16(pc + 1); + if (current_char > limit) { + pc = code_base + Load32(pc + 3); + } else { + pc += BC_CHECK_GT_LENGTH; + } + break; + } + BYTECODE(CHECK_REGISTER_LT) + if (registers[pc[1]] < Load16(pc + 2)) { + pc = code_base + Load32(pc + 4); + } else { + pc += BC_CHECK_REGISTER_LT_LENGTH; + } + break; + BYTECODE(CHECK_REGISTER_GE) + if (registers[pc[1]] >= Load16(pc + 2)) { + pc = code_base + Load32(pc + 4); + } else { + pc += BC_CHECK_REGISTER_GE_LENGTH; + } + break; + BYTECODE(LOOKUP_MAP1) { + // Look up character in a bitmap. If we find a 0, then jump to the + // location at pc + 7. Otherwise fall through! 
+ int index = current_char - Load16(pc + 1); + byte map = code_base[Load32(pc + 3) + (index >> 3)]; + map = ((map >> (index & 7)) & 1); + if (map == 0) { + pc = code_base + Load32(pc + 7); + } else { + pc += BC_LOOKUP_MAP1_LENGTH; + } + break; + } + BYTECODE(LOOKUP_MAP2) { + // Look up character in a half-nibble map. If we find 00, then jump to + // the location at pc + 7. If we find 01 then jump to location at + // pc + 11, etc. + int index = (current_char - Load16(pc + 1)) << 1; + byte map = code_base[Load32(pc + 3) + (index >> 3)]; + map = ((map >> (index & 7)) & 3); + if (map < 2) { + if (map == 0) { + pc = code_base + Load32(pc + 7); + } else { + pc = code_base + Load32(pc + 11); + } + } else { + if (map == 2) { + pc = code_base + Load32(pc + 15); + } else { + pc = code_base + Load32(pc + 19); + } + } + break; + } + BYTECODE(LOOKUP_MAP8) { + // Look up character in a byte map. Use the byte as an index into a + // table that follows this instruction immediately. + int index = current_char - Load16(pc + 1); + byte map = code_base[Load32(pc + 3) + index]; + const byte* new_pc = code_base + Load32(pc + 7) + (map << 2); + pc = code_base + Load32(new_pc); + break; + } + BYTECODE(LOOKUP_HI_MAP8) { + // Look up high byte of this character in a byte map. Use the byte as + // an index into a table that follows this instruction immediately. 
+ int index = (current_char >> 8) - pc[1]; + byte map = code_base[Load32(pc + 2) + index]; + const byte* new_pc = code_base + Load32(pc + 6) + (map << 2); + pc = code_base + Load32(new_pc); + break; + } + BYTECODE(CHECK_NOT_BACK_REF) { + int from = registers[pc[1]]; + int len = registers[pc[1] + 1] - from; + if (current + len > subject.length()) { + pc = code_base + Load32(pc + 2); + break; + } else { + int i; + for (i = 0; i < len; i++) { + if (subject[from + i] != subject[current + i]) { + pc = code_base + Load32(pc + 2); + break; + } + } + if (i < len) break; + current += len; + } + pc += BC_CHECK_NOT_BACK_REF_LENGTH; + break; + } + default: + UNREACHABLE(); + break; + } + } +} + + +bool IrregexpInterpreter::Match(Handle code_array, + Handle subject16, + int* registers, + int start_position) { + ASSERT(StringShape(*subject16).IsTwoByteRepresentation()); + ASSERT(subject16->IsFlat(StringShape(*subject16))); + + + AssertNoAllocation a; + const byte* code_base = code_array->GetDataStartAddress(); + return RawMatch(code_base, + Vector(subject16->GetTwoByteData(), + subject16->length()), + registers, + start_position); +} + +} } // namespace v8::internal diff --git a/src/interpreter-irregexp.h b/src/interpreter-irregexp.h new file mode 100644 index 0000000..ee8d440 --- /dev/null +++ b/src/interpreter-irregexp.h @@ -0,0 +1,47 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// A simple interpreter for the Irregexp byte code. + +#ifndef V8_INTERPRETER_IRREGEXP_H_ +#define V8_INTERPRETER_IRREGEXP_H_ + +namespace v8 { namespace internal { + +// Interprets Irregexp byte code; subject16 must be a flat two-byte string. +class IrregexpInterpreter { + public: + static bool Match(Handle code, + Handle subject16, + int* captures, + int start_position); +}; + + +} } // namespace v8::internal + +#endif // V8_INTERPRETER_IRREGEXP_H_ diff --git a/src/jsregexp-inl.h b/src/jsregexp-inl.h new file mode 100644 index 0000000..ec3b5ee --- /dev/null +++ b/src/jsregexp-inl.h @@ -0,0 +1,266 @@ +// Copyright 2006-2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef V8_JSREGEXP_INL_H_ +#define V8_JSREGEXP_INL_H_ + + +#include "jsregexp.h" +#include "regexp-macro-assembler.h" + + +namespace v8 { +namespace internal { + + +template +bool ZoneSplayTree::Insert(const Key& key, Locator* locator) { + if (is_empty()) { + // If the tree is empty, insert the new node. + root_ = new Node(key, C::kNoValue); + } else { + // Splay on the key to move the last node on the search path + // for the key to the root of the tree. + Splay(key); + // Ignore repeated insertions with the same key. + int cmp = C::Compare(key, root_->key_); + if (cmp == 0) { + locator->bind(root_); + return false; + } + // Insert the new node. 
+ Node* node = new Node(key, C::kNoValue); + if (cmp > 0) { + node->left_ = root_; + node->right_ = root_->right_; + root_->right_ = NULL; + } else { + node->right_ = root_; + node->left_ = root_->left_; + root_->left_ = NULL; + } + root_ = node; + } + locator->bind(root_); + return true; +} + + +template +bool ZoneSplayTree::Find(const Key& key, Locator* locator) { + if (is_empty()) + return false; + Splay(key); + if (C::Compare(key, root_->key_) == 0) { + locator->bind(root_); + return true; + } else { + return false; + } +} + + +template +bool ZoneSplayTree::FindGreatestLessThan(const Key& key, + Locator* locator) { + if (is_empty()) + return false; + // Splay on the key to move the node with the given key or the last + // node on the search path to the top of the tree. + Splay(key); + // Now the result is either the root node or the greatest node in + // the left subtree. + int cmp = C::Compare(root_->key_, key); + if (cmp <= 0) { + locator->bind(root_); + return true; + } else { + Node* temp = root_; + root_ = root_->left_; + bool result = FindGreatest(locator); + root_ = temp; + return result; + } +} + + +template +bool ZoneSplayTree::FindLeastGreaterThan(const Key& key, + Locator* locator) { + if (is_empty()) + return false; + // Splay on the key to move the node with the given key or the last + // node on the search path to the top of the tree. + Splay(key); + // Now the result is either the root node or the least node in + // the right subtree. 
+ int cmp = C::Compare(root_->key_, key); + if (cmp >= 0) { + locator->bind(root_); + return true; + } else { + Node* temp = root_; + root_ = root_->right_; + bool result = FindLeast(locator); + root_ = temp; + return result; + } +} + + +template +bool ZoneSplayTree::FindGreatest(Locator* locator) { + if (is_empty()) + return false; + Node* current = root_; + while (current->right_ != NULL) + current = current->right_; + locator->bind(current); + return true; +} + + +template +bool ZoneSplayTree::FindLeast(Locator* locator) { + if (is_empty()) + return false; + Node* current = root_; + while (current->left_ != NULL) + current = current->left_; + locator->bind(current); + return true; +} + + +template +bool ZoneSplayTree::Remove(const Key& key) { + // Bail if the tree is empty + if (is_empty()) + return false; + // Splay on the key to move the node with the given key to the top. + Splay(key); + // Bail if the key is not in the tree + if (C::Compare(key, root_->key_) != 0) + return false; + if (root_->left_ == NULL) { + // No left child, so the new tree is just the right child. + root_ = root_->right_; + } else { + // Left child exists. + Node* right = root_->right_; + // Make the original left child the new root. + root_ = root_->left_; + // Splay to make sure that the new root has an empty right child. + Splay(key); + // Insert the original right child as the right child of the new + // root. + root_->right_ = right; + } + return true; +} + + +template +void ZoneSplayTree::Splay(const Key& key) { + if (is_empty()) + return; + Node dummy_node(C::kNoKey, C::kNoValue); + // Create a dummy node. The use of the dummy node is a bit + // counter-intuitive: The right child of the dummy node will hold + // the L tree of the algorithm. The left child of the dummy node + // will hold the R tree of the algorithm. Using a dummy node, left + // and right will always be nodes and we avoid special cases. 
+ Node* dummy = &dummy_node; + Node* left = dummy; + Node* right = dummy; + Node* current = root_; + while (true) { + int cmp = C::Compare(key, current->key_); + if (cmp < 0) { + if (current->left_ == NULL) + break; + if (C::Compare(key, current->left_->key_) < 0) { + // Rotate right. + Node* temp = current->left_; + current->left_ = temp->right_; + temp->right_ = current; + current = temp; + if (current->left_ == NULL) + break; + } + // Link right. + right->left_ = current; + right = current; + current = current->left_; + } else if (cmp > 0) { + if (current->right_ == NULL) + break; + if (C::Compare(key, current->right_->key_) > 0) { + // Rotate left. + Node* temp = current->right_; + current->right_ = temp->left_; + temp->left_ = current; + current = temp; + if (current->right_ == NULL) + break; + } + // Link left. + left->right_ = current; + left = current; + current = current->right_; + } else { + break; + } + } + // Assemble. + left->right_ = current->left_; + right->left_ = current->right_; + current->left_ = dummy->right_; + current->right_ = dummy->left_; + root_ = current; +} + + +template +static void DoForEach(Node* node, Callback* callback) { + if (node == NULL) return; + DoForEach(node->left(), callback); + callback->Call(node->key(), node->value()); + DoForEach(node->right(), callback); +} + + +void RegExpNode::Bind(RegExpMacroAssembler* macro) { + macro->Bind(&label_); +} + + +} // namespace internal +} // namespace v8 + + +#endif // V8_JSREGEXP_INL_H_ diff --git a/src/jsregexp.cc b/src/jsregexp.cc index cacf5ed..7e25ea8 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -25,15 +25,30 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#define _HAS_EXCEPTIONS 0 +#include + #include "v8.h" +#include "ast.h" #include "execution.h" #include "factory.h" -#include "jsregexp.h" +#include "jsregexp-inl.h" #include "platform.h" #include "runtime.h" #include "top.h" #include "compilation-cache.h" +#include "string-stream.h" +#include "parser.h" +#include "assembler-irregexp.h" +#include "regexp-macro-assembler.h" +#include "regexp-macro-assembler-irregexp.h" +#if defined __arm__ || defined __thumb__ || defined ARM +// include regexp-macro-assembler-arm.h when created. +#else // ia32 +#include "regexp-macro-assembler-ia32.h" +#endif +#include "interpreter-irregexp.h" // Including pcre.h undefines DEBUG to avoid getting debug output from // the JSCRE implementation. Make sure to redefine it in debug mode @@ -45,11 +60,9 @@ #include "third_party/jscre/pcre.h" #endif -namespace v8 { namespace internal { +namespace v8 { namespace internal { -#define CAPTURE_INDEX 0 -#define INTERNAL_INDEX 1 static Failure* malloc_failure; @@ -176,7 +189,16 @@ static JSRegExp::Flags RegExpFlagsFromString(Handle str) { } -unibrow::Predicate is_reg_exp_special_char; +static inline void ThrowRegExpException(Handle re, + Handle pattern, + Handle error_text, + const char* message) { + Handle array = Factory::NewJSArray(2); + SetElement(array, 0, pattern); + SetElement(array, 1, error_text); + Handle regexp_err = Factory::NewSyntaxError(message, array); + Top::Throw(*regexp_err); +} Handle RegExpImpl::Compile(Handle re, @@ -186,20 +208,42 @@ Handle RegExpImpl::Compile(Handle re, Handle cached = CompilationCache::LookupRegExp(pattern, flags); bool in_cache = !cached.is_null(); Handle result; - StringShape shape(*pattern); if (in_cache) { re->set_data(*cached); result = re; } else { - bool is_atom = !flags.is_ignore_case(); - for (int i = 0; is_atom && i < pattern->length(shape); i++) { - if (is_reg_exp_special_char.get(pattern->Get(shape, i))) - is_atom = false; + FlattenString(pattern); + RegExpParseResult parse_result; + 
FlatStringReader reader(pattern); + if (!ParseRegExp(&reader, &parse_result)) { + // Throw an exception if we fail to parse the pattern. + ThrowRegExpException(re, + pattern, + parse_result.error, + "malformed_regexp"); + return Handle(); } - if (is_atom) { - result = AtomCompile(re, pattern, flags); + RegExpAtom* atom = parse_result.tree->AsAtom(); + if (atom != NULL && !flags.is_ignore_case()) { + if (parse_result.has_character_escapes) { + Vector atom_pattern = atom->data(); + Handle atom_string = + Factory::NewStringFromTwoByte(atom_pattern); + result = AtomCompile(re, pattern, flags, atom_string); + } else { + result = AtomCompile(re, pattern, flags, pattern); + } } else { - result = JsreCompile(re, pattern, flags); + RegExpNode* node = NULL; + Handle irregexp_data = + RegExpEngine::Compile(&parse_result, + &node, + flags.is_ignore_case()); + if (irregexp_data.is_null()) { + result = JscrePrepare(re, pattern, flags); + } else { + result = IrregexpPrepare(re, pattern, flags, irregexp_data); + } } Object* data = re->data(); if (data->IsFixedArray()) { @@ -220,9 +264,11 @@ Handle RegExpImpl::Exec(Handle regexp, Handle index) { switch (regexp->TypeTag()) { case JSRegExp::JSCRE: - return JsreExec(regexp, subject, index); + return JscreExec(regexp, subject, index); case JSRegExp::ATOM: return AtomExec(regexp, subject, index); + case JSRegExp::IRREGEXP: + return IrregexpExec(regexp, subject, index); default: UNREACHABLE(); return Handle(); @@ -234,9 +280,11 @@ Handle RegExpImpl::ExecGlobal(Handle regexp, Handle subject) { switch (regexp->TypeTag()) { case JSRegExp::JSCRE: - return JsreExecGlobal(regexp, subject); + return JscreExecGlobal(regexp, subject); case JSRegExp::ATOM: return AtomExecGlobal(regexp, subject); + case JSRegExp::IRREGEXP: + return IrregexpExecGlobal(regexp, subject); default: UNREACHABLE(); return Handle(); @@ -246,8 +294,9 @@ Handle RegExpImpl::ExecGlobal(Handle regexp, Handle RegExpImpl::AtomCompile(Handle re, Handle pattern, - JSRegExp::Flags 
flags) { - Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern); + JSRegExp::Flags flags, + Handle match_pattern) { + Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern); return re; } @@ -267,12 +316,8 @@ Handle RegExpImpl::AtomExec(Handle re, if (value == -1) return Factory::null_value(); Handle array = Factory::NewFixedArray(2); - array->set(0, - Smi::FromInt(value), - SKIP_WRITE_BARRIER); - array->set(1, - Smi::FromInt(value + needle->length()), - SKIP_WRITE_BARRIER); + array->set(0, Smi::FromInt(value)); + array->set(1, Smi::FromInt(value + needle->length())); return Factory::NewJSArrayWithElements(array); } @@ -296,12 +341,8 @@ Handle RegExpImpl::AtomExecGlobal(Handle re, int end = value + needle_length; Handle array = Factory::NewFixedArray(2); - array->set(0, - Smi::FromInt(value), - SKIP_WRITE_BARRIER); - array->set(1, - Smi::FromInt(end), - SKIP_WRITE_BARRIER); + array->set(0, Smi::FromInt(value)); + array->set(1, Smi::FromInt(end)); Handle pair = Factory::NewJSArrayWithElements(array); SetElement(result, match_count, pair); match_count++; @@ -312,6 +353,24 @@ Handle RegExpImpl::AtomExecGlobal(Handle re, } +HandleRegExpImpl::JscrePrepare(Handle re, + Handle pattern, + JSRegExp::Flags flags) { + Handle value(Heap::undefined_value()); + Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); + return re; +} + + +HandleRegExpImpl::IrregexpPrepare(Handle re, + Handle pattern, + JSRegExp::Flags flags, + Handle irregexp_data) { + Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data); + return re; +} + + static inline Object* DoCompile(String* pattern, JSRegExp::Flags flags, unsigned* number_of_captures, @@ -358,9 +417,13 @@ void CompileWithRetryAfterGC(Handle pattern, } -Handle RegExpImpl::JsreCompile(Handle re, - Handle pattern, - JSRegExp::Flags flags) { +Handle RegExpImpl::JscreCompile(Handle re) { + ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE); + 
ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()); + + Handle pattern(re->Pattern()); + JSRegExp::Flags flags = re->GetFlags(); + Handle two_byte_pattern = StringToTwoByte(pattern); unsigned number_of_captures; @@ -391,26 +454,110 @@ Handle RegExpImpl::JsreCompile(Handle re, Handle internal( ByteArray::FromDataStartAddress(reinterpret_cast
(code))); - Handle value = Factory::NewFixedArray(2); - value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures)); - value->set(INTERNAL_INDEX, *internal); + Handle value = Factory::NewFixedArray(kJscreDataLength); + value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures)); + value->set(kJscreInternalIndex, *internal); Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); return re; } -Handle RegExpImpl::JsreExecOnce(Handle regexp, - int num_captures, - Handle subject, - int previous_index, - const uc16* two_byte_subject, - int* offsets_vector, - int offsets_vector_length) { +Handle RegExpImpl::IrregexpExecOnce(Handle regexp, + int num_captures, + Handle two_byte_subject, + int previous_index, + int* offsets_vector, + int offsets_vector_length) { +#ifdef DEBUG + if (FLAG_trace_regexp_bytecodes) { + String* pattern = regexp->Pattern(); + PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); + PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString())); + } +#endif + ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation()); + ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject))); + bool rc; + { + for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) { + offsets_vector[i] = -1; + } + + LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject)); + + FixedArray* irregexp = + FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex)); + int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value(); + + switch (tag) { + case RegExpMacroAssembler::kIA32Implementation: { + Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex)); + SmartPointer captures(NewArray((num_captures + 1) * 2)); + Address start_addr = + Handle::cast(two_byte_subject)->GetCharsAddress(); + int start_offset = + start_addr - reinterpret_cast
(*two_byte_subject); + int end_offset = + start_offset + (two_byte_subject->length() - previous_index) * 2; + typedef bool testfunc(String**, int, int, int*); + testfunc* test = FUNCTION_CAST(code->entry()); + rc = test(two_byte_subject.location(), + start_offset, + end_offset, + *captures); + if (rc) { + // Capture values are relative to start_offset only. + for (int i = 0; i < offsets_vector_length; i++) { + if (offsets_vector[i] >= 0) { + offsets_vector[i] += previous_index; + } + } + } + break; + } + default: + case RegExpMacroAssembler::kARMImplementation: + UNREACHABLE(); + rc = false; + break; + case RegExpMacroAssembler::kBytecodeImplementation: { + Handle byte_codes = IrregexpCode(regexp); + + rc = IrregexpInterpreter::Match(byte_codes, + two_byte_subject, + offsets_vector, + previous_index); + break; + } + } + } + + if (!rc) { + return Factory::null_value(); + } + + Handle array = Factory::NewFixedArray(2 * (num_captures+1)); + // The captures come in (start, end+1) pairs. + for (int i = 0; i < 2 * (num_captures+1); i += 2) { + array->set(i, Smi::FromInt(offsets_vector[i])); + array->set(i+1, Smi::FromInt(offsets_vector[i+1])); + } + return Factory::NewJSArrayWithElements(array); +} + + +Handle RegExpImpl::JscreExecOnce(Handle regexp, + int num_captures, + Handle subject, + int previous_index, + const uc16* two_byte_subject, + int* offsets_vector, + int offsets_vector_length) { int rc; { AssertNoAllocation a; - ByteArray* internal = JsreInternal(regexp); + ByteArray* internal = JscreInternal(regexp); const JscreRegExp* js_regexp = reinterpret_cast(internal->GetDataStartAddress()); @@ -444,12 +591,8 @@ Handle RegExpImpl::JsreExecOnce(Handle regexp, Handle array = Factory::NewFixedArray(2 * (num_captures+1)); // The captures come in (start, end+1) pairs. 
for (int i = 0; i < 2 * (num_captures+1); i += 2) { - array->set(i, - Smi::FromInt(offsets_vector[i]), - SKIP_WRITE_BARRIER); - array->set(i+1, - Smi::FromInt(offsets_vector[i+1]), - SKIP_WRITE_BARRIER); + array->set(i, Smi::FromInt(offsets_vector[i])); + array->set(i+1, Smi::FromInt(offsets_vector[i+1])); } return Factory::NewJSArrayWithElements(array); } @@ -457,8 +600,8 @@ Handle RegExpImpl::JsreExecOnce(Handle regexp, class OffsetsVector { public: - inline OffsetsVector(int num_captures) { - offsets_vector_length_ = (num_captures + 1) * 3; + inline OffsetsVector(int num_registers) : + offsets_vector_length_(num_registers) { if (offsets_vector_length_ > kStaticOffsetsVectorSize) { vector_ = NewArray(offsets_vector_length_); } else { @@ -487,7 +630,7 @@ class OffsetsVector { private: int* vector_; int offsets_vector_length_; - static const int kStaticOffsetsVectorSize = 30; + static const int kStaticOffsetsVectorSize = 50; static int static_offsets_vector_[kStaticOffsetsVectorSize]; }; @@ -496,33 +639,126 @@ int OffsetsVector::static_offsets_vector_[ OffsetsVector::kStaticOffsetsVectorSize]; -Handle RegExpImpl::JsreExec(Handle regexp, +Handle RegExpImpl::IrregexpExec(Handle regexp, Handle subject, Handle index) { + ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); + ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined()); + // Prepare space for the return values. 
- int num_captures = JsreCapture(regexp); + int number_of_registers = IrregexpNumberOfRegisters(regexp); + OffsetsVector offsets(number_of_registers); + + int num_captures = IrregexpNumberOfCaptures(regexp); + + int previous_index = static_cast(DoubleToInteger(index->Number())); + + Handle subject16 = CachedStringToTwoByte(subject); + + Handle result( + IrregexpExecOnce(regexp, + num_captures, + subject16, + previous_index, + offsets.vector(), + offsets.length())); + return result; +} + + +Handle RegExpImpl::JscreExec(Handle regexp, + Handle subject, + Handle index) { + ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE); + if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) { + Handle compile_result = JscreCompile(regexp); + if (compile_result.is_null()) return compile_result; + } + ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray()); - OffsetsVector offsets(num_captures); + int num_captures = JscreNumberOfCaptures(regexp); + + OffsetsVector offsets((num_captures + 1) * 3); int previous_index = static_cast(DoubleToInteger(index->Number())); Handle subject16 = CachedStringToTwoByte(subject); - Handle result(JsreExecOnce(regexp, num_captures, subject, - previous_index, - subject16->GetTwoByteData(), - offsets.vector(), offsets.length())); + Handle result(JscreExecOnce(regexp, + num_captures, + subject, + previous_index, + subject16->GetTwoByteData(), + offsets.vector(), + offsets.length())); return result; } -Handle RegExpImpl::JsreExecGlobal(Handle regexp, - Handle subject) { +Handle RegExpImpl::IrregexpExecGlobal(Handle regexp, + Handle subject) { + ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); + ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined()); + + // Prepare space for the return values. 
+ int number_of_registers = IrregexpNumberOfRegisters(regexp); + OffsetsVector offsets(number_of_registers); + + int previous_index = 0; + + Handle result = Factory::NewJSArray(0); + int i = 0; + Handle matches; + + Handle subject16 = CachedStringToTwoByte(subject); + + do { + if (previous_index > subject->length() || previous_index < 0) { + // Per ECMA-262 15.10.6.2, if the previous index is greater than the + // string length, there is no match. + matches = Factory::null_value(); + } else { + matches = IrregexpExecOnce(regexp, + IrregexpNumberOfCaptures(regexp), + subject16, + previous_index, + offsets.vector(), + offsets.length()); + + if (matches->IsJSArray()) { + SetElement(result, i, matches); + i++; + previous_index = offsets.vector()[1]; + if (offsets.vector()[0] == offsets.vector()[1]) { + previous_index++; + } + } + } + } while (matches->IsJSArray()); + + // If we exited the loop with an exception, throw it. + if (matches->IsNull()) { // Exited loop normally. + return result; + } else { // Exited loop with the exception in matches. + return matches; + } +} + + +Handle RegExpImpl::JscreExecGlobal(Handle regexp, + Handle subject) { + ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE); + if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) { + Handle compile_result = JscreCompile(regexp); + if (compile_result.is_null()) return compile_result; + } + ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray()); + // Prepare space for the return values. - int num_captures = JsreCapture(regexp); + int num_captures = JscreNumberOfCaptures(regexp); - OffsetsVector offsets(num_captures); + OffsetsVector offsets((num_captures + 1) * 3); int previous_index = 0; @@ -538,9 +774,13 @@ Handle RegExpImpl::JsreExecGlobal(Handle regexp, // string length, there is no match. 
matches = Factory::null_value(); } else { - matches = JsreExecOnce(regexp, num_captures, subject, previous_index, - subject16->GetTwoByteData(), - offsets.vector(), offsets.length()); + matches = JscreExecOnce(regexp, + num_captures, + subject, + previous_index, + subject16->GetTwoByteData(), + offsets.vector(), + offsets.length()); if (matches->IsJSArray()) { SetElement(result, i, matches); @@ -562,15 +802,1798 @@ Handle RegExpImpl::JsreExecGlobal(Handle regexp, } -int RegExpImpl::JsreCapture(Handle re) { +int RegExpImpl::JscreNumberOfCaptures(Handle re) { FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); - return Smi::cast(value->get(CAPTURE_INDEX))->value(); + return Smi::cast(value->get(kJscreNumberOfCapturesIndex))-> + value(); } -ByteArray* RegExpImpl::JsreInternal(Handle re) { +ByteArray* RegExpImpl::JscreInternal(Handle re) { FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); - return ByteArray::cast(value->get(INTERNAL_INDEX)); + return ByteArray::cast(value->get(kJscreInternalIndex)); +} + + +int RegExpImpl::IrregexpNumberOfCaptures(Handle re) { + FixedArray* value = + FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); + return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value(); +} + + +int RegExpImpl::IrregexpNumberOfRegisters(Handle re) { + FixedArray* value = + FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); + return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value(); +} + + +Handle RegExpImpl::IrregexpCode(Handle re) { + FixedArray* value = + FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); + return Handle(ByteArray::cast(value->get(kIrregexpCodeIndex))); +} + + +// ------------------------------------------------------------------- +// New regular expression engine + + +void RegExpTree::AppendToText(RegExpText* text) { + UNREACHABLE(); +} + + +void RegExpAtom::AppendToText(RegExpText* text) { + text->AddElement(TextElement::Atom(this)); +} + 
+ +void RegExpCharacterClass::AppendToText(RegExpText* text) { + text->AddElement(TextElement::CharClass(this)); +} + + +void RegExpText::AppendToText(RegExpText* text) { + for (int i = 0; i < elements()->length(); i++) + text->AddElement(elements()->at(i)); +} + + +TextElement TextElement::Atom(RegExpAtom* atom) { + TextElement result = TextElement(ATOM); + result.data.u_atom = atom; + return result; +} + + +TextElement TextElement::CharClass( + RegExpCharacterClass* char_class) { + TextElement result = TextElement(CHAR_CLASS); + result.data.u_char_class = char_class; + return result; +} + + +class RegExpCompiler { + public: + RegExpCompiler(int capture_count, bool ignore_case); + + int AllocateRegister() { return next_register_++; } + + Handle Assemble(RegExpMacroAssembler* assembler, + RegExpNode* start, + int capture_count); + + inline void AddWork(RegExpNode* node) { work_list_->Add(node); } + + static const int kImplementationOffset = 0; + static const int kNumberOfRegistersOffset = 0; + static const int kCodeOffset = 1; + + RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } + EndNode* accept() { return accept_; } + EndNode* backtrack() { return backtrack_; } + + static const int kMaxRecursion = 100; + inline int recursion_depth() { return recursion_depth_; } + inline void IncrementRecursionDepth() { recursion_depth_++; } + inline void DecrementRecursionDepth() { recursion_depth_--; } + + inline bool is_case_independent() { return is_case_independent_; } + + private: + EndNode* accept_; + EndNode* backtrack_; + int next_register_; + List* work_list_; + int recursion_depth_; + RegExpMacroAssembler* macro_assembler_; + bool is_case_independent_; +}; + + +// Attempts to compile the regexp using an Irregexp code generator. Returns +// a fixed array or a null handle depending on whether it succeeded. 
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) + : next_register_(2 * (capture_count + 1)), + work_list_(NULL), + recursion_depth_(0), + is_case_independent_(ignore_case) { + accept_ = new EndNode(EndNode::ACCEPT); + backtrack_ = new EndNode(EndNode::BACKTRACK); +} + + +Handle RegExpCompiler::Assemble( + RegExpMacroAssembler* macro_assembler, + RegExpNode* start, + int capture_count) { + if (!FLAG_attempt_case_independent && is_case_independent_) { + return Handle::null(); + } + macro_assembler_ = macro_assembler; + List work_list(0); + work_list_ = &work_list; + Label fail; + macro_assembler->PushBacktrack(&fail); + if (!start->GoTo(this)) { + fail.Unuse(); + return Handle::null(); + } + while (!work_list.is_empty()) { + if (!work_list.RemoveLast()->GoTo(this)) { + fail.Unuse(); + return Handle::null(); + } + } + macro_assembler->Bind(&fail); + macro_assembler->Fail(); + Handle array = + Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength); + array->set(RegExpImpl::kIrregexpImplementationIndex, + Smi::FromInt(macro_assembler->Implementation())); + array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex, + Smi::FromInt(next_register_)); + array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex, + Smi::FromInt(capture_count)); + Handle code = macro_assembler->GetCode(); + array->set(RegExpImpl::kIrregexpCodeIndex, *code); + work_list_ = NULL; + return array; +} + + +bool RegExpNode::GoTo(RegExpCompiler* compiler) { + // TODO(erikcorry): Implement support. 
+ if (info_.follows_word_interest || + info_.follows_newline_interest || + info_.follows_start_interest) { + return false; + } + if (label_.is_bound()) { + compiler->macro_assembler()->GoTo(&label_); + return true; + } else { + if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) { + compiler->macro_assembler()->GoTo(&label_); + compiler->AddWork(this); + return true; + } else { + compiler->IncrementRecursionDepth(); + bool how_it_went = Emit(compiler); + compiler->DecrementRecursionDepth(); + return how_it_went; + } + } +} + + +bool EndNode::GoTo(RegExpCompiler* compiler) { + if (info()->follows_word_interest || + info()->follows_newline_interest || + info()->follows_start_interest) { + return false; + } + if (!label()->is_bound()) { + Bind(compiler->macro_assembler()); + } + switch (action_) { + case ACCEPT: + compiler->macro_assembler()->Succeed(); + break; + case BACKTRACK: + compiler->macro_assembler()->Backtrack(); + break; + } + return true; +} + + +Label* RegExpNode::label() { + return &label_; +} + + +bool EndNode::Emit(RegExpCompiler* compiler) { + RegExpMacroAssembler* macro = compiler->macro_assembler(); + switch (action_) { + case ACCEPT: + Bind(macro); + macro->Succeed(); + return true; + case BACKTRACK: + Bind(macro); + macro->Backtrack(); + return true; + } + return false; +} + + +void GuardedAlternative::AddGuard(Guard* guard) { + if (guards_ == NULL) + guards_ = new ZoneList(1); + guards_->Add(guard); +} + + +ActionNode* ActionNode::StoreRegister(int reg, + int val, + RegExpNode* on_success) { + ActionNode* result = new ActionNode(STORE_REGISTER, on_success); + result->data_.u_store_register.reg = reg; + result->data_.u_store_register.value = val; + return result; +} + + +ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { + ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); + result->data_.u_increment_register.reg = reg; + return result; } + +ActionNode* ActionNode::StorePosition(int reg, 
RegExpNode* on_success) { + ActionNode* result = new ActionNode(STORE_POSITION, on_success); + result->data_.u_position_register.reg = reg; + return result; +} + + +ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) { + ActionNode* result = new ActionNode(SAVE_POSITION, on_success); + result->data_.u_position_register.reg = reg; + return result; +} + + +ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { + ActionNode* result = new ActionNode(RESTORE_POSITION, on_success); + result->data_.u_position_register.reg = reg; + return result; +} + + +ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) { + ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); + result->data_.u_submatch_stack_pointer_register.reg = reg; + return result; +} + + +ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) { + ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success); + result->data_.u_submatch_stack_pointer_register.reg = reg; + return result; +} + + +#define DEFINE_ACCEPT(Type) \ + void Type##Node::Accept(NodeVisitor* visitor) { \ + visitor->Visit##Type(this); \ + } +FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) +#undef DEFINE_ACCEPT + + +// ------------------------------------------------------------------- +// Emit code. 
+ + +void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, + Guard* guard, + Label* on_failure) { + switch (guard->op()) { + case Guard::LT: + macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure); + break; + case Guard::GEQ: + macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure); + break; + } +} + + +static unibrow::Mapping uncanonicalize; +static unibrow::Mapping canonrange; + + +static inline void EmitAtomNonLetters( + RegExpMacroAssembler* macro_assembler, + TextElement elm, + Vector quarks, + Label* on_failure, + int cp_offset) { + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + for (int i = quarks.length() - 1; i >= 0; i--) { + uc16 c = quarks[i]; + int length = uncanonicalize.get(c, '\0', chars); + if (length <= 1) { + macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); + macro_assembler->CheckNotCharacter(c, on_failure); + } + } +} + + +static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, + uc16 c1, + uc16 c2, + Label* on_failure) { + uc16 exor = c1 ^ c2; + // Check whether exor has only one bit set. + if (((exor - 1) & exor) == 0) { + // If c1 and c2 differ only by one bit. + // Ecma262UnCanonicalize always gives the highest number last. + ASSERT(c2 > c1); + macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure); + return true; + } else { + ASSERT(c2 > c1); + uc16 diff = c2 - c1; + if (((diff - 1) & diff) == 0 && c1 >= diff) { + // If the characters differ by 2^n but don't differ by one bit then + // subtract the difference from the found character, then do the or + // trick. We avoid the theoretical case where negative numbers are + // involved in order to simplify code generation. 
+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff, + diff, + on_failure); + return true; + } + } + return false; +} + + +static inline void EmitAtomLetters( + RegExpMacroAssembler* macro_assembler, + TextElement elm, + Vector quarks, + Label* on_failure, + int cp_offset) { + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + for (int i = quarks.length() - 1; i >= 0; i--) { + uc16 c = quarks[i]; + int length = uncanonicalize.get(c, '\0', chars); + if (length <= 1) continue; + macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); + Label ok; + ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); + switch (length) { + case 2: { + if (ShortCutEmitCharacterPair(macro_assembler, + chars[0], + chars[1], + on_failure)) { + ok.Unuse(); + } else { + macro_assembler->CheckCharacter(chars[0], &ok); + macro_assembler->CheckNotCharacter(chars[1], on_failure); + macro_assembler->Bind(&ok); + } + break; + } + case 4: + macro_assembler->CheckCharacter(chars[3], &ok); + // Fall through! + case 3: + macro_assembler->CheckCharacter(chars[0], &ok); + macro_assembler->CheckCharacter(chars[1], &ok); + macro_assembler->CheckNotCharacter(chars[2], on_failure); + macro_assembler->Bind(&ok); + break; + default: + UNREACHABLE(); + break; + } + } +} + + +static void EmitCharClass(RegExpMacroAssembler* macro_assembler, + RegExpCharacterClass* cc, + int cp_offset, + Label* on_failure) { + macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); + cp_offset++; + + ZoneList* ranges = cc->ranges(); + + Label success; + + Label *char_is_in_class = + cc->is_negated() ? 
on_failure : &success; + + int range_count = ranges->length(); + + if (range_count == 0) { + if (!cc->is_negated()) { + macro_assembler->GoTo(on_failure); + } + return; + } + + for (int i = 0; i < range_count - 1; i++) { + CharacterRange& range = ranges->at(i); + Label next_range; + uc16 from = range.from(); + uc16 to = range.to(); + if (to == from) { + macro_assembler->CheckCharacter(to, char_is_in_class); + } else { + if (from != 0) { + macro_assembler->CheckCharacterLT(from, &next_range); + } + if (to != 0xffff) { + macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); + } else { + macro_assembler->GoTo(char_is_in_class); + } + } + macro_assembler->Bind(&next_range); + } + + CharacterRange& range = ranges->at(range_count - 1); + uc16 from = range.from(); + uc16 to = range.to(); + + if (to == from) { + if (cc->is_negated()) { + macro_assembler->CheckCharacter(to, on_failure); + } else { + macro_assembler->CheckNotCharacter(to, on_failure); + } + } else { + if (from != 0) { + if (!cc->is_negated()) { + macro_assembler->CheckCharacterLT(from, on_failure); + } else { + macro_assembler->CheckCharacterLT(from, &success); + } + } + if (to != 0xffff) { + if (!cc->is_negated()) { + macro_assembler->CheckCharacterGT(to, on_failure); + } else { + macro_assembler->CheckCharacterLT(to + 1, on_failure); + } + } else { + if (cc->is_negated()) { + macro_assembler->GoTo(on_failure); + } + } + } + macro_assembler->Bind(&success); +} + + + +bool TextNode::Emit(RegExpCompiler* compiler) { + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + Bind(macro_assembler); + int element_count = elms_->length(); + int cp_offset = 0; + // First, handle straight character matches. 
+ for (int i = 0; i < element_count; i++) { + TextElement elm = elms_->at(i); + if (elm.type == TextElement::ATOM) { + Vector quarks = elm.data.u_atom->data(); + if (!compiler->is_case_independent()) { + macro_assembler->CheckCharacters(quarks, + cp_offset, + on_failure_->label()); + } else { + EmitAtomNonLetters(macro_assembler, + elm, + quarks, + on_failure_->label(), + cp_offset); + } + cp_offset += quarks.length(); + } else { + ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); + cp_offset++; + } + } + // Second, handle case independent letter matches if any. + if (compiler->is_case_independent()) { + cp_offset = 0; + for (int i = 0; i < element_count; i++) { + TextElement elm = elms_->at(i); + if (elm.type == TextElement::ATOM) { + Vector quarks = elm.data.u_atom->data(); + EmitAtomLetters(macro_assembler, + elm, + quarks, + on_failure_->label(), + cp_offset); + cp_offset += quarks.length(); + } else { + cp_offset++; + } + } + } + // If the fast character matches passed then do the character classes. + cp_offset = 0; + for (int i = 0; i < element_count; i++) { + TextElement elm = elms_->at(i); + if (elm.type == TextElement::CHAR_CLASS) { + RegExpCharacterClass* cc = elm.data.u_char_class; + EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label()); + cp_offset++; + } else { + cp_offset += elm.data.u_atom->data().length(); + } + } + + compiler->AddWork(on_failure_); + macro_assembler->AdvanceCurrentPosition(cp_offset); + return on_success()->GoTo(compiler); +} + + +bool ChoiceNode::Emit(RegExpCompiler* compiler) { + int choice_count = alternatives_->length(); + RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); + Bind(macro_assembler); + // For now we just call all choices one after the other. The idea ultimately + // is to use the Dispatch table to try only the relevant ones. 
+ int i; + for (i = 0; i < choice_count - 1; i++) { + GuardedAlternative alternative = alternatives_->at(i); + Label after; + Label after_no_pop_cp; + ZoneList* guards = alternative.guards(); + if (guards != NULL) { + int guard_count = guards->length(); + for (int j = 0; j < guard_count; j++) { + GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp); + } + } + macro_assembler->PushCurrentPosition(); + macro_assembler->PushBacktrack(&after); + if (!alternative.node()->GoTo(compiler)) { + after.Unuse(); + after_no_pop_cp.Unuse(); + return false; + } + macro_assembler->Bind(&after); + macro_assembler->PopCurrentPosition(); + macro_assembler->Bind(&after_no_pop_cp); + } + GuardedAlternative alternative = alternatives_->at(i); + ZoneList* guards = alternative.guards(); + if (guards != NULL) { + int guard_count = guards->length(); + for (int j = 0; j < guard_count; j++) { + GenerateGuard(macro_assembler, guards->at(j), on_failure_->label()); + } + } + if (!on_failure_->IsBacktrack()) { + ASSERT_NOT_NULL(on_failure_ -> label()); + macro_assembler->PushBacktrack(on_failure_->label()); + compiler->AddWork(on_failure_); + } + if (!alternative.node()->GoTo(compiler)) { + return false; + } + return true; +} + + +bool ActionNode::Emit(RegExpCompiler* compiler) { + RegExpMacroAssembler* macro = compiler->macro_assembler(); + Bind(macro); + switch (type_) { + case STORE_REGISTER: + macro->SetRegister(data_.u_store_register.reg, + data_.u_store_register.value); + break; + case INCREMENT_REGISTER: { + Label undo; + macro->PushBacktrack(&undo); + macro->AdvanceRegister(data_.u_increment_register.reg, 1); + bool ok = on_success()->GoTo(compiler); + if (!ok) { + undo.Unuse(); + return false; + } + macro->Bind(&undo); + macro->AdvanceRegister(data_.u_increment_register.reg, -1); + macro->Backtrack(); + break; + } + case STORE_POSITION: { + Label undo; + macro->PushRegister(data_.u_position_register.reg); + macro->PushBacktrack(&undo); + 
macro->WriteCurrentPositionToRegister(data_.u_position_register.reg); + bool ok = on_success()->GoTo(compiler); + if (!ok) { + undo.Unuse(); + return false; + } + macro->Bind(&undo); + macro->PopRegister(data_.u_position_register.reg); + macro->Backtrack(); + break; + } + case SAVE_POSITION: + macro->WriteCurrentPositionToRegister( + data_.u_position_register.reg); + break; + case RESTORE_POSITION: + macro->ReadCurrentPositionFromRegister( + data_.u_position_register.reg); + break; + case BEGIN_SUBMATCH: + macro->WriteStackPointerToRegister( + data_.u_submatch_stack_pointer_register.reg); + break; + case ESCAPE_SUBMATCH: + macro->ReadStackPointerFromRegister( + data_.u_submatch_stack_pointer_register.reg); + break; + default: + UNREACHABLE(); + return false; + } + return on_success()->GoTo(compiler); +} + + +bool BackReferenceNode::Emit(RegExpCompiler* compiler) { + RegExpMacroAssembler* macro = compiler->macro_assembler(); + Bind(macro); + // Check whether the registers are uninitialized and always + // succeed if they are. 
+ macro->IfRegisterLT(start_reg_, 0, on_success()->label()); + macro->IfRegisterLT(end_reg_, 0, on_success()->label()); + ASSERT_EQ(start_reg_ + 1, end_reg_); + macro->CheckNotBackReference(start_reg_, on_failure_->label()); + return on_success()->GoTo(compiler); +} + + +// ------------------------------------------------------------------- +// Dot/dotty output + + +#ifdef DEBUG + + +class DotPrinter: public NodeVisitor { + public: + DotPrinter() : stream_(&alloc_) { } + void PrintNode(const char* label, RegExpNode* node); + void Visit(RegExpNode* node); + void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure); + StringStream* stream() { return &stream_; } +#define DECLARE_VISIT(Type) \ + virtual void Visit##Type(Type##Node* that); +FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT + private: + HeapStringAllocator alloc_; + StringStream stream_; + std::set<RegExpNode*> seen_; +}; + + +void DotPrinter::PrintNode(const char* label, RegExpNode* node) { + stream()->Add("digraph G {\n graph [label=\""); + for (int i = 0; label[i]; i++) { + switch (label[i]) { + case '\\': + stream()->Add("\\\\"); + break; + case '"': + stream()->Add("\\\""); // dot needs \" to keep a quote inside the label + break; + default: + stream()->Put(label[i]); + break; + } + } + stream()->Add("\"];\n"); + Visit(node); + stream()->Add("}\n"); + printf("%s", *(stream()->ToCString())); +} + + +void DotPrinter::Visit(RegExpNode* node) { + if (seen_.find(node) != seen_.end()) + return; + seen_.insert(node); + node->Accept(this); +} + + +void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { + if (on_failure->IsBacktrack()) return; + stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); + Visit(on_failure); +} + + +class TableEntryBodyPrinter { + public: + TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) + : stream_(stream), choice_(choice) { } + void Call(uc16 from, DispatchTable::Entry entry) { + OutSet* out_set = entry.out_set(); + for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { + if
(out_set->Get(i)) { + stream()->Add(" n%p:s%io%i -> n%p;\n", + choice(), + from, + i, + choice()->alternatives()->at(i).node()); + } + } + } + private: + StringStream* stream() { return stream_; } + ChoiceNode* choice() { return choice_; } + StringStream* stream_; + ChoiceNode* choice_; +}; + + +class TableEntryHeaderPrinter { + public: + explicit TableEntryHeaderPrinter(StringStream* stream) + : first_(true), stream_(stream) { } + void Call(uc16 from, DispatchTable::Entry entry) { + if (first_) { + first_ = false; + } else { + stream()->Add("|"); + } + stream()->Add("{\\%k-\\%k|{", from, entry.to()); + OutSet* out_set = entry.out_set(); + int priority = 0; + for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { + if (out_set->Get(i)) { + if (priority > 0) stream()->Add("|"); + stream()->Add("<s%io%i> %i", from, i, priority); + priority++; + } + } + stream()->Add("}}"); + } + private: + bool first_; + StringStream* stream() { return stream_; } + StringStream* stream_; +}; + + +void DotPrinter::VisitChoice(ChoiceNode* that) { + stream()->Add(" n%p [shape=Mrecord, label=\"", that); + TableEntryHeaderPrinter header_printer(stream()); + that->table()->ForEach(&header_printer); + stream()->Add("\"]\n", that); + TableEntryBodyPrinter body_printer(stream(), that); + that->table()->ForEach(&body_printer); + PrintOnFailure(that, that->on_failure()); + for (int i = 0; i < that->alternatives()->length(); i++) { + GuardedAlternative alt = that->alternatives()->at(i); + alt.node()->Accept(this); + } +} + + +void DotPrinter::VisitText(TextNode* that) { + stream()->Add(" n%p [label=\"", that); + for (int i = 0; i < that->elements()->length(); i++) { + if (i > 0) stream()->Add(" "); + TextElement elm = that->elements()->at(i); + switch (elm.type) { + case TextElement::ATOM: { + stream()->Add("'%w'", elm.data.u_atom->data()); + break; + } + case TextElement::CHAR_CLASS: { + RegExpCharacterClass* node = elm.data.u_char_class; + stream()->Add("["); + if (node->is_negated()) +
stream()->Add("^"); + for (int j = 0; j < node->ranges()->length(); j++) { + CharacterRange range = node->ranges()->at(j); + stream()->Add("%k-%k", range.from(), range.to()); + } + stream()->Add("]"); + break; + } + default: + UNREACHABLE(); + } + } + stream()->Add("\", shape=box, peripheries=2];\n"); + stream()->Add(" n%p -> n%p;\n", that, that->on_success()); + Visit(that->on_success()); + PrintOnFailure(that, that->on_failure()); +} + + +void DotPrinter::VisitBackReference(BackReferenceNode* that) { + stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", + that, + that->start_register(), + that->end_register()); + stream()->Add(" n%p -> n%p;\n", that, that->on_success()); + Visit(that->on_success()); + PrintOnFailure(that, that->on_failure()); +} + + +void DotPrinter::VisitEnd(EndNode* that) { + stream()->Add(" n%p [style=bold, shape=point];\n", that); +} + + +void DotPrinter::VisitAction(ActionNode* that) { + stream()->Add(" n%p [", that); + switch (that->type_) { + case ActionNode::STORE_REGISTER: + stream()->Add("label=\"$%i:=%i\", shape=octagon", + that->data_.u_store_register.reg, + that->data_.u_store_register.value); + break; + case ActionNode::INCREMENT_REGISTER: + stream()->Add("label=\"$%i++\", shape=octagon", + that->data_.u_increment_register.reg); + break; + case ActionNode::STORE_POSITION: + stream()->Add("label=\"$%i:=$pos\", shape=octagon", + that->data_.u_position_register.reg); + break; + case ActionNode::SAVE_POSITION: + stream()->Add("label=\"$%i:=$pos\", shape=octagon", + that->data_.u_position_register.reg); + break; + case ActionNode::RESTORE_POSITION: + stream()->Add("label=\"$pos:=$%i\", shape=octagon", + that->data_.u_position_register.reg); + break; + case ActionNode::BEGIN_SUBMATCH: + stream()->Add("label=\"begin\", shape=septagon"); + break; + case ActionNode::ESCAPE_SUBMATCH: + stream()->Add("label=\"escape\", shape=septagon"); + break; + } + stream()->Add("];\n"); + stream()->Add(" n%p -> n%p;\n", that, 
that->on_success()); + Visit(that->on_success()); +} + + +class DispatchTableDumper { + public: + explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } + void Call(uc16 key, DispatchTable::Entry entry); + StringStream* stream() { return stream_; } + private: + StringStream* stream_; +}; + + +void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { + stream()->Add("[%k-%k]: {", key, entry.to()); + OutSet* set = entry.out_set(); + bool first = true; + for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { + if (set->Get(i)) { + if (first) { + first = false; + } else { + stream()->Add(", "); + } + stream()->Add("%i", i); + } + } + stream()->Add("}\n"); +} + + +void DispatchTable::Dump() { + HeapStringAllocator alloc; + StringStream stream(&alloc); + DispatchTableDumper dumper(&stream); + tree()->ForEach(&dumper); + OS::PrintError("%s", *stream.ToCString()); +} + + +void RegExpEngine::DotPrint(const char* label, RegExpNode* node) { + DotPrinter printer; + printer.PrintNode(label, node); +} + + +#endif // DEBUG + + +// ------------------------------------------------------------------- +// Tree to graph conversion + + +RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + ZoneList* elms = new ZoneList(1); + elms->Add(TextElement::Atom(this)); + return new TextNode(elms, on_success, on_failure); +} + + +RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + return new TextNode(elements(), on_success, on_failure); +} + + +RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + ZoneList* elms = new ZoneList(1); + elms->Add(TextElement::CharClass(this)); + return new TextNode(elms, on_success, on_failure); +} + + +RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + ZoneList* alternatives = 
this->alternatives(); + int length = alternatives->length(); + ChoiceNode* result = new ChoiceNode(length, on_failure); + for (int i = 0; i < length; i++) { + GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, + on_success, + on_failure)); + result->AddAlternative(alternative); + } + return result; +} + + +RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + return ToNode(min(), + max(), + is_greedy(), + body(), + compiler, + on_success, + on_failure); +} + + +RegExpNode* RegExpQuantifier::ToNode(int min, + int max, + bool is_greedy, + RegExpTree* body, + RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + // x{f, t} becomes this: + // + // (r++)<-. + // | ` + // | (x) + // v ^ + // (r=0)-->(?)---/ [if r < t] + // | + // [if r >= f] \----> ... + // + // + // TODO(someone): clear captures on repetition and handle empty + // matches. + bool has_min = min > 0; + bool has_max = max < RegExpQuantifier::kInfinity; + bool needs_counter = has_min || has_max; + int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1; + ChoiceNode* center = new ChoiceNode(2, on_failure); + RegExpNode* loop_return = needs_counter + ? 
static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) + : static_cast<RegExpNode*>(center); + RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure); + GuardedAlternative body_alt(body_node); + if (has_max) { + Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); + body_alt.AddGuard(body_guard); + } + GuardedAlternative rest_alt(on_success); + if (has_min) { + Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min); + rest_alt.AddGuard(rest_guard); + } + if (is_greedy) { + center->AddAlternative(body_alt); + center->AddAlternative(rest_alt); + } else { + center->AddAlternative(rest_alt); + center->AddAlternative(body_alt); + } + if (needs_counter) { + return ActionNode::StoreRegister(reg_ctr, 0, center); + } else { + return center; + } +} + + +RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + NodeInfo info; + switch (type()) { + case START_OF_LINE: + info.follows_newline_interest = true; + break; + case START_OF_INPUT: + info.follows_start_interest = true; + break; + case BOUNDARY: case NON_BOUNDARY: + info.follows_word_interest = true; + break; + case END_OF_LINE: case END_OF_INPUT: + // This is wrong but has the effect of making the compiler abort.
+ info.follows_start_interest = true; + } + return on_success->PropagateInterest(&info); +} + + +RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + return new BackReferenceNode(RegExpCapture::StartRegister(index()), + RegExpCapture::EndRegister(index()), + on_success, + on_failure); +} + + +RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + return on_success; +} + + +RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + int stack_pointer_register = compiler->AllocateRegister(); + int position_register = compiler->AllocateRegister(); + if (is_positive()) { + // begin submatch scope + // $reg = $pos + // if [body] + // then + // $pos = $reg + // escape submatch scope (drop all backtracks created in scope) + // succeed + // else + // end submatch scope (nothing to clean up, just exit the scope) + // fail + return ActionNode::BeginSubmatch( + stack_pointer_register, + ActionNode::SavePosition( + position_register, + body()->ToNode( + compiler, + ActionNode::RestorePosition( + position_register, + ActionNode::EscapeSubmatch(stack_pointer_register, + on_success)), + on_failure))); + } else { + // begin submatch scope + // try + // first if (body) + // then + // escape submatch scope + // fail + // else + // backtrack + // second + // end submatch scope + // restore current position + // succeed + ChoiceNode* try_node = + new ChoiceNode(1, ActionNode::RestorePosition(position_register, + on_success)); + RegExpNode* body_node = body()->ToNode( + compiler, + ActionNode::EscapeSubmatch(stack_pointer_register, on_failure), + compiler->backtrack()); + GuardedAlternative body_alt(body_node); + try_node->AddAlternative(body_alt); + return ActionNode::BeginSubmatch(stack_pointer_register, + ActionNode::SavePosition( + position_register, + try_node)); + } +} + + +RegExpNode* 
RegExpCapture::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + return ToNode(body(), index(), compiler, on_success, on_failure); +} + + +RegExpNode* RegExpCapture::ToNode(RegExpTree* body, + int index, + RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + int start_reg = RegExpCapture::StartRegister(index); + int end_reg = RegExpCapture::EndRegister(index); + RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success); + RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure); + return ActionNode::StorePosition(start_reg, body_node); +} + + +RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpNode* on_failure) { + ZoneList* children = nodes(); + RegExpNode* current = on_success; + for (int i = children->length() - 1; i >= 0; i--) { + current = children->at(i)->ToNode(compiler, current, on_failure); + } + return current; +} + + +static const int kSpaceRangeCount = 20; +static const uc16 kSpaceRanges[kSpaceRangeCount] = { + 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680, + 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, + 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 +}; + + +static const int kWordRangeCount = 8; +static const uc16 kWordRanges[kWordRangeCount] = { + '0', '9', 'A', 'Z', '_', '_', 'a', 'z' +}; + + +static const int kDigitRangeCount = 2; +static const uc16 kDigitRanges[kDigitRangeCount] = { + '0', '9' +}; + + +static const int kLineTerminatorRangeCount = 6; +static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { + 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 +}; + + +static void AddClass(const uc16* elmv, + int elmc, + ZoneList* ranges) { + for (int i = 0; i < elmc; i += 2) { + ASSERT(elmv[i] <= elmv[i + 1]); + ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); + } +} + + +static void AddClassNegated(const uc16 *elmv, + int elmc, + ZoneList* ranges) { + ASSERT(elmv[0] != 0x0000); 
+ ASSERT(elmv[elmc-1] != 0xFFFF); + uc16 last = 0x0000; + for (int i = 0; i < elmc; i += 2) { + ASSERT(last <= elmv[i] - 1); + ASSERT(elmv[i] <= elmv[i + 1]); + ranges->Add(CharacterRange(last, elmv[i] - 1)); + last = elmv[i + 1] + 1; + } + ranges->Add(CharacterRange(last, 0xFFFF)); +} + + +void CharacterRange::AddClassEscape(uc16 type, + ZoneList* ranges) { + switch (type) { + case 's': + AddClass(kSpaceRanges, kSpaceRangeCount, ranges); + break; + case 'S': + AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); + break; + case 'w': + AddClass(kWordRanges, kWordRangeCount, ranges); + break; + case 'W': + AddClassNegated(kWordRanges, kWordRangeCount, ranges); + break; + case 'd': + AddClass(kDigitRanges, kDigitRangeCount, ranges); + break; + case 'D': + AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); + break; + case '.': + AddClassNegated(kLineTerminatorRanges, + kLineTerminatorRangeCount, + ranges); + break; + // This is not a character range as defined by the spec but a + // convenient shorthand for a character class that matches any + // character. + case '*': + ranges->Add(CharacterRange::Everything()); + break; + default: + UNREACHABLE(); + } +} + + +void CharacterRange::AddCaseEquivalents(ZoneList* ranges) { + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + if (IsSingleton()) { + // If this is a singleton we just expand the one character. + int length = uncanonicalize.get(from(), '\0', chars); + for (int i = 0; i < length; i++) { + uc32 chr = chars[i]; + if (chr != from()) { + ranges->Add(CharacterRange::Singleton(chars[i])); + } + } + } else if (from() <= kRangeCanonicalizeMax + && to() <= kRangeCanonicalizeMax) { + // If this is a range we expand the characters block by block, + // expanding contiguous subranges (blocks) one at a time. + // The approach is as follows. For a given start character we + // look up the block that contains it, for instance 'a' if the + // start character is 'c'. 
A block is characterized by the property + // that all characters uncanonicalize in the same way as the first + // element, except that each entry in the result is incremented + // by the distance from the first element. So a-z is a block + // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter + // uncanonicalizes to ['a' + k, 'A' + k]. + // Once we've found the start point we look up its uncanonicalization + // and produce a range for each element. For instance for [c-f] + // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only + // add a range if it is not already contained in the input, so [c-f] + // will be skipped but [C-F] will be added. If this range is not + // completely contained in a block we do this for all the blocks + // covered by the range. + unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + // First, look up the block that contains the 'from' character. + int length = canonrange.get(from(), '\0', range); + if (length == 0) { + range[0] = from(); + } else { + ASSERT_EQ(1, length); + } + int pos = from(); + // The start of the current block. Note that except for the first + // iteration 'start' is always equal to 'pos'. + int start; + // If it is not the start point of a block the entry contains the + // offset of the character from the start point. + if ((range[0] & kStartMarker) == 0) { + start = pos - range[0]; + } else { + start = pos; + } + // Then we add the ranges one at a time, incrementing the current + // position to be after the last block each time. The position + // always points to the start of a block. + while (pos < to()) { + length = canonrange.get(start, '\0', range); + if (length == 0) { + range[0] = start; + } else { + ASSERT_EQ(1, length); + } + ASSERT((range[0] & kStartMarker) != 0); + // The start point of a block contains the distance to the end + // of the range. + int block_end = start + (range[0] & kPayloadMask) - 1; + int end = (block_end > to()) ?
to() : block_end; + length = uncanonicalize.get(start, '\0', range); + for (int i = 0; i < length; i++) { + uc32 c = range[i]; + uc16 range_from = c + (pos - start); + uc16 range_to = c + (end - start); + if (!(from() <= range_from && range_to <= to())) + ranges->Add(CharacterRange(range_from, range_to)); + } + start = pos = block_end + 1; + } + } else { + // TODO(plesner) when we've fixed the 2^11 bug in unibrow. + } +} + + +// ------------------------------------------------------------------- +// Interest propagation + + +RegExpNode* RegExpNode::GetSibling(NodeInfo* info) { + for (int i = 0; i < siblings_.length(); i++) { + RegExpNode* sibling = siblings_.Get(i); + if (sibling->info()->SameInterests(info)) + return sibling; + } + return NULL; +} + + +template <class C> +static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) { + RegExpNode* sibling = node->GetSibling(info); + if (sibling != NULL) return sibling; + node->EnsureSiblings(); + sibling = new C(*node); + sibling->info()->AdoptInterests(info); + node->AddSibling(sibling); + return sibling; +} + + +RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) { + RegExpNode* sibling = GetSibling(info); + if (sibling != NULL) return sibling; + EnsureSiblings(); + ActionNode* action = new ActionNode(*this); + action->info()->AdoptInterests(info); + AddSibling(action); + action->set_on_success(action->on_success()->PropagateInterest(info)); + return action; +} + + +RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) { + RegExpNode* sibling = GetSibling(info); + if (sibling != NULL) return sibling; + EnsureSiblings(); + ChoiceNode* choice = new ChoiceNode(*this); + choice->info()->AdoptInterests(info); + AddSibling(choice); + ZoneList<GuardedAlternative>* old_alternatives = alternatives(); + int count = old_alternatives->length(); + choice->alternatives_ = new ZoneList<GuardedAlternative>(count); + for (int i = 0; i < count; i++) { + GuardedAlternative alternative = old_alternatives->at(i); +
alternative.set_node(alternative.node()->PropagateInterest(info)); + choice->alternatives()->Add(alternative); + } + return choice; +} + + +RegExpNode* EndNode::PropagateInterest(NodeInfo* info) { + return PropagateToEndpoint(this, info); +} + + +RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) { + return PropagateToEndpoint(this, info); +} + + +RegExpNode* TextNode::PropagateInterest(NodeInfo* info) { + return PropagateToEndpoint(this, info); +} + + +// ------------------------------------------------------------------- +// Splay tree + + +OutSet* OutSet::Extend(unsigned value) { + if (Get(value)) + return this; + if (successors() != NULL) { + for (int i = 0; i < successors()->length(); i++) { + OutSet* successor = successors()->at(i); + if (successor->Get(value)) + return successor; + } + } else { + successors_ = new ZoneList<OutSet*>(2); + } + OutSet* result = new OutSet(first_, remaining_); + result->Set(value); + successors()->Add(result); + return result; +} + + +void OutSet::Set(unsigned value) { + if (value < kFirstLimit) { + first_ |= (1u << value); // unsigned mask: no signed-shift overflow + } else { + if (remaining_ == NULL) + remaining_ = new ZoneList<unsigned>(1); + if (remaining_->is_empty() || !remaining_->Contains(value)) + remaining_->Add(value); + } +} + + +bool OutSet::Get(unsigned value) { + if (value < kFirstLimit) { + return (first_ & (1u << value)) != 0; + } else if (remaining_ == NULL) { + return false; + } else { + return remaining_->Contains(value); + } +} + + +const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; +const DispatchTable::Entry DispatchTable::Config::kNoValue; + + +void DispatchTable::AddRange(CharacterRange full_range, int value) { + CharacterRange current = full_range; + if (tree()->is_empty()) { + // If this is the first range we just insert into the table.
+ ZoneSplayTree::Locator loc; + ASSERT_RESULT(tree()->Insert(current.from(), &loc)); + loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); + return; + } + // First see if there is a range to the left of this one that + // overlaps. + ZoneSplayTree::Locator loc; + if (tree()->FindGreatestLessThan(current.from(), &loc)) { + Entry* entry = &loc.value(); + // If we've found a range that overlaps with this one, and it + // starts strictly to the left of this one, we have to fix it + // because the following code only handles ranges that start on + // or after the start point of the range we're adding. + if (entry->from() < current.from() && entry->to() >= current.from()) { + // Snap the overlapping range in half around the start point of + // the range we're adding. + CharacterRange left(entry->from(), current.from() - 1); + CharacterRange right(current.from(), entry->to()); + // The left part of the overlapping range doesn't overlap. + // Truncate the whole entry to be just the left part. + entry->set_to(left.to()); + // The right part is the one that overlaps. We add this part + // to the map and let the next step deal with merging it with + // the range we're adding. + ZoneSplayTree::Locator loc; + ASSERT_RESULT(tree()->Insert(right.from(), &loc)); + loc.set_value(Entry(right.from(), + right.to(), + entry->out_set())); + } + } + while (current.is_valid()) { + if (tree()->FindLeastGreaterThan(current.from(), &loc) && + (loc.value().from() <= current.to()) && + (loc.value().to() >= current.from())) { + Entry* entry = &loc.value(); + // We have overlap. If there is space between the start point of + // the range we're adding and where the overlapping range starts + // then we have to add a range covering just that space. 
+ if (current.from() < entry->from()) { + ZoneSplayTree::Locator ins; + ASSERT_RESULT(tree()->Insert(current.from(), &ins)); + ins.set_value(Entry(current.from(), + entry->from() - 1, + empty()->Extend(value))); + current.set_from(entry->from()); + } + ASSERT_EQ(current.from(), entry->from()); + // If the overlapping range extends beyond the one we want to add + // we have to snap the right part off and add it separately. + if (entry->to() > current.to()) { + ZoneSplayTree::Locator ins; + ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); + ins.set_value(Entry(current.to() + 1, + entry->to(), + entry->out_set())); + entry->set_to(current.to()); + } + ASSERT(entry->to() <= current.to()); + // The overlapping range is now completely contained by the range + // we're adding so we can just update it and move the start point + // of the range we're adding just past it. + entry->AddValue(value); + // Bail out if the last interval ended at 0xFFFF since otherwise + // adding 1 will wrap around to 0. 
+ if (entry->to() == 0xFFFF) + break; + ASSERT(entry->to() + 1 > current.from()); + current.set_from(entry->to() + 1); + } else { + // There is no overlap so we can just add the range + ZoneSplayTree::Locator ins; + ASSERT_RESULT(tree()->Insert(current.from(), &ins)); + ins.set_value(Entry(current.from(), + current.to(), + empty()->Extend(value))); + break; + } + } +} + + +OutSet* DispatchTable::Get(uc16 value) { + ZoneSplayTree::Locator loc; + if (!tree()->FindGreatestLessThan(value, &loc)) + return empty(); + Entry* entry = &loc.value(); + if (value <= entry->to()) + return entry->out_set(); + else + return empty(); +} + + +// ------------------------------------------------------------------- +// Analysis + + +void Analysis::EnsureAnalyzed(RegExpNode* that) { + if (that->info()->been_analyzed || that->info()->being_analyzed) + return; + that->info()->being_analyzed = true; + that->Accept(this); + that->info()->being_analyzed = false; + that->info()->been_analyzed = true; +} + + +void Analysis::VisitEnd(EndNode* that) { + // nothing to do +} + + +void Analysis::VisitText(TextNode* that) { + EnsureAnalyzed(that->on_success()); + EnsureAnalyzed(that->on_failure()); +} + + +void Analysis::VisitAction(ActionNode* that) { + RegExpNode* next = that->on_success(); + EnsureAnalyzed(next); + that->info()->determine_newline = next->info()->prev_determine_newline(); + that->info()->determine_word = next->info()->prev_determine_word(); + that->info()->determine_start = next->info()->prev_determine_start(); +} + + +void Analysis::VisitChoice(ChoiceNode* that) { + NodeInfo* info = that->info(); + for (int i = 0; i < that->alternatives()->length(); i++) { + RegExpNode* node = that->alternatives()->at(i).node(); + EnsureAnalyzed(node); + info->determine_newline |= node->info()->prev_determine_newline(); + info->determine_word |= node->info()->prev_determine_word(); + info->determine_start |= node->info()->prev_determine_start(); + } + if (!that->table_calculated()) { + 
DispatchTableConstructor cons(that->table()); + cons.BuildTable(that); + } + EnsureAnalyzed(that->on_failure()); +} + + +void Analysis::VisitBackReference(BackReferenceNode* that) { + EnsureAnalyzed(that->on_success()); + EnsureAnalyzed(that->on_failure()); +} + + +// ------------------------------------------------------------------- +// Dispatch table construction + + +void DispatchTableConstructor::VisitEnd(EndNode* that) { + AddRange(CharacterRange::Everything()); +} + + +void DispatchTableConstructor::BuildTable(ChoiceNode* node) { + ASSERT(!node->table_calculated()); + node->set_being_calculated(true); + ZoneList* alternatives = node->alternatives(); + for (int i = 0; i < alternatives->length(); i++) { + set_choice_index(i); + alternatives->at(i).node()->Accept(this); + } + node->set_being_calculated(false); + node->set_table_calculated(true); +} + + +class AddDispatchRange { + public: + explicit AddDispatchRange(DispatchTableConstructor* constructor) + : constructor_(constructor) { } + void Call(uc32 from, DispatchTable::Entry entry); + private: + DispatchTableConstructor* constructor_; +}; + + +void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { + CharacterRange range(from, entry.to()); + constructor_->AddRange(range); +} + + +void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { + if (node->being_calculated()) + return; + if (!node->table_calculated()) { + DispatchTableConstructor constructor(node->table()); + constructor.BuildTable(node); + } + ASSERT(node->table_calculated()); + AddDispatchRange adder(this); + node->table()->ForEach(&adder); +} + + +void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { + // TODO(160): Find the node that we refer back to and propagate its start + // set back to here. For now we just accept anything. 
+ AddRange(CharacterRange::Everything()); +} + + + +static int CompareRangeByFrom(const CharacterRange* a, + const CharacterRange* b) { + return Spaceship(a->from(), b->from()); +} + + +void DispatchTableConstructor::AddInverse(ZoneList* ranges) { + ranges->Sort(CompareRangeByFrom); + uc16 last = 0; + for (int i = 0; i < ranges->length(); i++) { + CharacterRange range = ranges->at(i); + if (last < range.from()) + AddRange(CharacterRange(last, range.from() - 1)); + if (range.to() >= last) { + if (range.to() == 0xFFFF) { + return; + } else { + last = range.to() + 1; + } + } + } + AddRange(CharacterRange(last, 0xFFFF)); +} + + +void DispatchTableConstructor::VisitText(TextNode* that) { + TextElement elm = that->elements()->at(0); + switch (elm.type) { + case TextElement::ATOM: { + uc16 c = elm.data.u_atom->data()[0]; + AddRange(CharacterRange(c, c)); + break; + } + case TextElement::CHAR_CLASS: { + RegExpCharacterClass* tree = elm.data.u_char_class; + ZoneList* ranges = tree->ranges(); + if (tree->is_negated()) { + AddInverse(ranges); + } else { + for (int i = 0; i < ranges->length(); i++) + AddRange(ranges->at(i)); + } + break; + } + default: { + UNIMPLEMENTED(); + } + } +} + + +void DispatchTableConstructor::VisitAction(ActionNode* that) { + that->on_success()->Accept(this); +} + + +Handle RegExpEngine::Compile(RegExpParseResult* input, + RegExpNode** node_return, + bool ignore_case) { + RegExpCompiler compiler(input->capture_count, ignore_case); + // Wrap the body of the regexp in capture #0. + RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, + 0, + &compiler, + compiler.accept(), + compiler.backtrack()); + // Add a .*? at the beginning, outside the body capture. + // Note: We could choose to not add this if the regexp is anchored at + // the start of the input but I'm not sure how best to do that and + // since we don't even handle ^ yet I'm saving that optimization for + // later. 
+ RegExpNode* node = RegExpQuantifier::ToNode(0, + RegExpQuantifier::kInfinity, + false, + new RegExpCharacterClass('*'), + &compiler, + captured_body, + compiler.backtrack()); + if (node_return != NULL) *node_return = node; + Analysis analysis; + analysis.EnsureAnalyzed(node); + + if (!FLAG_irregexp) { + return Handle::null(); + } + +#if !(defined ARM || defined __arm__ || defined __thumb__) + if (FLAG_irregexp_native) { // Flag only checked in IA32 mode. + // TODO(lrn) Move compilation to a later point in the life-cycle + // of the RegExp. We don't know the type of input string yet. + // For now, always assume two-byte strings. + RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16, + (input->capture_count + 1) * 2, + ignore_case); + return compiler.Assemble(&macro_assembler, + node, + input->capture_count); + } +#endif + byte codes[1024]; + IrregexpAssembler assembler(Vector(codes, 1024)); + RegExpMacroAssemblerIrregexp macro_assembler(&assembler); + return compiler.Assemble(&macro_assembler, + node, + input->capture_count); +} + + }} // namespace v8::internal diff --git a/src/jsregexp.h b/src/jsregexp.h index c05380d..d8ed043 100644 --- a/src/jsregexp.h +++ b/src/jsregexp.h @@ -30,6 +30,10 @@ namespace v8 { namespace internal { + +class RegExpMacroAssembler; + + class RegExpImpl { public: // Creates a regular expression literal in the old space. @@ -61,10 +65,28 @@ class RegExpImpl { static Handle ExecGlobal(Handle regexp, Handle subject); + // Stores an uncompiled RegExp pattern in the JSRegExp object. + // It will be compiled by JSCRE when first executed. + static Handle JscrePrepare(Handle re, + Handle pattern, + JSRegExp::Flags flags); + + // Stores a compiled RegExp pattern in the JSRegExp object. + // The pattern is compiled by Irregexp. + static Handle IrregexpPrepare(Handle re, + Handle pattern, + JSRegExp::Flags flags, + Handle irregexp_data); + + + // Compile the pattern using JSCRE and store the result in the + // JSRegExp object. 
+ static Handle JscreCompile(Handle re); + static Handle AtomCompile(Handle re, Handle pattern, - JSRegExp::Flags flags); - + JSRegExp::Flags flags, + Handle match_pattern); static Handle AtomExec(Handle regexp, Handle subject, Handle index); @@ -72,47 +94,78 @@ class RegExpImpl { static Handle AtomExecGlobal(Handle regexp, Handle subject); - static Handle JsreCompile(Handle re, - Handle pattern, - JSRegExp::Flags flags); + static Handle JscreCompile(Handle re, + Handle pattern, + JSRegExp::Flags flags); - static Handle JsreExec(Handle regexp, - Handle subject, - Handle index); + // Execute a compiled JSCRE pattern. + static Handle JscreExec(Handle regexp, + Handle subject, + Handle index); - static Handle JsreExecGlobal(Handle regexp, - Handle subject); + // Execute an Irregexp bytecode pattern. + static Handle IrregexpExec(Handle regexp, + Handle subject, + Handle index); + + static Handle JscreExecGlobal(Handle regexp, + Handle subject); + + static Handle IrregexpExecGlobal(Handle regexp, + Handle subject); static void NewSpaceCollectionPrologue(); static void OldSpaceCollectionPrologue(); - private: // Converts a source string to a 16 bit flat string. The string // will be either sequential or it will be a SlicedString backed // by a flat string. static Handle StringToTwoByte(Handle pattern); static Handle CachedStringToTwoByte(Handle pattern); + static const int kIrregexpImplementationIndex = 0; + static const int kIrregexpNumberOfCapturesIndex = 1; + static const int kIrregexpNumberOfRegistersIndex = 2; + static const int kIrregexpCodeIndex = 3; + static const int kIrregexpDataLength = 4; + + static const int kJscreNumberOfCapturesIndex = 0; + static const int kJscreInternalIndex = 1; + static const int kJscreDataLength = 2; + + private: static String* last_ascii_string_; static String* two_byte_cached_string_; - // Returns the caputure from the re. 
- static int JsreCapture(Handle re); - static ByteArray* JsreInternal(Handle re); + static int JscreNumberOfCaptures(Handle re); + static ByteArray* JscreInternal(Handle re); + + static int IrregexpNumberOfCaptures(Handle re); + static int IrregexpNumberOfRegisters(Handle re); + static Handle IrregexpCode(Handle re); // Call jsRegExpExecute once - static Handle JsreExecOnce(Handle regexp, - int num_captures, - Handle subject, - int previous_index, - const uc16* utf8_subject, - int* ovector, - int ovector_length); + static Handle JscreExecOnce(Handle regexp, + int num_captures, + Handle subject, + int previous_index, + const uc16* utf8_subject, + int* ovector, + int ovector_length); + + static Handle IrregexpExecOnce(Handle regexp, + int num_captures, + Handle subject16, + int previous_index, + int* ovector, + int ovector_length); // Set the subject cache. The previous string buffer is not deleted, so the // caller should ensure that it doesn't leak. - static void SetSubjectCache(String* subject, char* utf8_subject, - int uft8_length, int character_position, + static void SetSubjectCache(String* subject, + char* utf8_subject, + int uft8_length, + int character_position, int utf8_position); // A one element cache of the last utf8_subject string and its length. 
The @@ -125,6 +178,599 @@ class RegExpImpl { }; +class CharacterRange { + public: + CharacterRange() : from_(0), to_(0) { } + // For compatibility with the CHECK_OK macro + CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT + CharacterRange(uc16 from, uc16 to) + : from_(from), + to_(to) { + } + static void AddClassEscape(uc16 type, ZoneList* ranges); + static inline CharacterRange Singleton(uc16 value) { + return CharacterRange(value, value); + } + static inline CharacterRange Range(uc16 from, uc16 to) { + ASSERT(from <= to); + return CharacterRange(from, to); + } + static inline CharacterRange Everything() { + return CharacterRange(0, 0xFFFF); + } + bool Contains(uc16 i) { return from_ <= i && i <= to_; } + uc16 from() const { return from_; } + void set_from(uc16 value) { from_ = value; } + uc16 to() const { return to_; } + void set_to(uc16 value) { to_ = value; } + bool is_valid() { return from_ <= to_; } + bool IsSingleton() { return (from_ == to_); } + void AddCaseEquivalents(ZoneList* ranges); + static const int kRangeCanonicalizeMax = 0x200; + static const int kStartMarker = (1 << 24); + static const int kPayloadMask = (1 << 24) - 1; + private: + uc16 from_; + uc16 to_; +}; + + +template +static void DoForEach(Node* node, Callback* callback); + + +// A zone splay tree. The config type parameter encapsulates the +// different configurations of a concrete splay tree: +// +// typedef Key: the key type +// typedef Value: the value type +// static const kNoKey: the dummy key used when no key is set +// static const kNoValue: the dummy value used to initialize nodes +// int (Compare)(Key& a, Key& b) -> {-1, 0, 1}: comparison function +// +template +class ZoneSplayTree : public ZoneObject { + public: + typedef typename Config::Key Key; + typedef typename Config::Value Value; + + class Locator; + + ZoneSplayTree() : root_(NULL) { } + + // Inserts the given key in this tree with the given value. Returns + // true if a node was inserted, otherwise false. 
If found the locator + // is enabled and provides access to the mapping for the key. + bool Insert(const Key& key, Locator* locator); + + // Looks up the key in this tree and returns true if it was found, + // otherwise false. If the node is found the locator is enabled and + // provides access to the mapping for the key. + bool Find(const Key& key, Locator* locator); + + // Finds the mapping with the greatest key less than or equal to the + // given key. + bool FindGreatestLessThan(const Key& key, Locator* locator); + + // Find the mapping with the greatest key in this tree. + bool FindGreatest(Locator* locator); + + // Finds the mapping with the least key greater than or equal to the + // given key. + bool FindLeastGreaterThan(const Key& key, Locator* locator); + + // Find the mapping with the least key in this tree. + bool FindLeast(Locator* locator); + + // Remove the node with the given key from the tree. + bool Remove(const Key& key); + + bool is_empty() { return root_ == NULL; } + + // Perform the splay operation for the given key. Moves the node with + // the given key to the top of the tree. If no node has the given + // key, the last node on the search path is moved to the top of the + // tree. + void Splay(const Key& key); + + class Node : public ZoneObject { + public: + Node(const Key& key, const Value& value) + : key_(key), + value_(value), + left_(NULL), + right_(NULL) { } + Key key() { return key_; } + Value value() { return value_; } + Node* left() { return left_; } + Node* right() { return right_; } + private: + friend class ZoneSplayTree; + friend class Locator; + Key key_; + Value value_; + Node* left_; + Node* right_; + }; + + // A locator provides access to a node in the tree without actually + // exposing the node. 
+ class Locator { + public: + explicit Locator(Node* node) : node_(node) { } + Locator() : node_(NULL) { } + const Key& key() { return node_->key_; } + Value& value() { return node_->value_; } + void set_value(const Value& value) { node_->value_ = value; } + inline void bind(Node* node) { node_ = node; } + private: + Node* node_; + }; + + template + void ForEach(Callback* c) { + DoForEach::Node, Callback>(root_, c); + } + + private: + Node* root_; +}; + + +// A set of unsigned integers that behaves especially well on small +// integers (< 32). May do zone-allocation. +class OutSet: public ZoneObject { + public: + OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } + OutSet* Extend(unsigned value); + bool Get(unsigned value); + static const unsigned kFirstLimit = 32; + private: + + // Destructively set a value in this set. In most cases you want + // to use Extend instead to ensure that only one instance exists + // that contains the same values. + void Set(unsigned value); + + // The successors are a list of sets that contain the same values + // as this set and the one more value that is not present in this + // set. + ZoneList* successors() { return successors_; } + + OutSet(uint32_t first, ZoneList* remaining) + : first_(first), remaining_(remaining), successors_(NULL) { } + uint32_t first_; + ZoneList* remaining_; + ZoneList* successors_; +}; + + +// A mapping from integers, specified as ranges, to a set of integers. +// Used for mapping character ranges to choices. 
+class DispatchTable { + public: + class Entry { + public: + Entry() + : from_(0), to_(0), out_set_(NULL) { } + Entry(uc16 from, uc16 to, OutSet* out_set) + : from_(from), to_(to), out_set_(out_set) { } + uc16 from() { return from_; } + uc16 to() { return to_; } + void set_to(uc16 value) { to_ = value; } + void AddValue(int value) { out_set_ = out_set_->Extend(value); } + OutSet* out_set() { return out_set_; } + private: + uc16 from_; + uc16 to_; + OutSet* out_set_; + }; + + class Config { + public: + typedef uc16 Key; + typedef Entry Value; + static const uc16 kNoKey; + static const Entry kNoValue; + static inline int Compare(uc16 a, uc16 b) { + if (a == b) + return 0; + else if (a < b) + return -1; + else + return 1; + } + }; + + void AddRange(CharacterRange range, int value); + OutSet* Get(uc16 value); + void Dump(); + + template + void ForEach(Callback* callback) { return tree()->ForEach(callback); } + private: + // There can't be a static empty set since it allocates its + // successors in a zone and caches them. 
+ OutSet* empty() { return &empty_; } + OutSet empty_; + ZoneSplayTree* tree() { return &tree_; } + ZoneSplayTree tree_; +}; + + +#define FOR_EACH_NODE_TYPE(VISIT) \ + VISIT(End) \ + VISIT(Action) \ + VISIT(Choice) \ + VISIT(BackReference) \ + VISIT(Text) + + +#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ + VISIT(Disjunction) \ + VISIT(Alternative) \ + VISIT(Assertion) \ + VISIT(CharacterClass) \ + VISIT(Atom) \ + VISIT(Quantifier) \ + VISIT(Capture) \ + VISIT(Lookahead) \ + VISIT(BackReference) \ + VISIT(Empty) \ + VISIT(Text) + + +#define FORWARD_DECLARE(Name) class RegExp##Name; +FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) +#undef FORWARD_DECLARE + + +class TextElement { + public: + enum Type {UNINITIALIZED, ATOM, CHAR_CLASS}; + TextElement() : type(UNINITIALIZED) { } + explicit TextElement(Type t) : type(t) { } + static TextElement Atom(RegExpAtom* atom); + static TextElement CharClass(RegExpCharacterClass* char_class); + Type type; + union { + RegExpAtom* u_atom; + RegExpCharacterClass* u_char_class; + } data; +}; + + +struct NodeInfo { + NodeInfo() + : being_analyzed(false), + been_analyzed(false), + determine_word(false), + determine_newline(false), + determine_start(false), + follows_word_interest(false), + follows_newline_interest(false), + follows_start_interest(false) { } + bool SameInterests(NodeInfo* that) { + return (follows_word_interest == that->follows_word_interest) + && (follows_newline_interest == that->follows_newline_interest) + && (follows_start_interest == that->follows_start_interest); + } + void AdoptInterests(NodeInfo* that) { + follows_word_interest = that->follows_word_interest; + follows_newline_interest = that->follows_newline_interest; + follows_start_interest = that->follows_start_interest; + } + bool prev_determine_word() { + return determine_word || follows_word_interest; + } + bool prev_determine_newline() { + return determine_newline || follows_newline_interest; + } + bool prev_determine_start() { + return determine_start || 
follows_start_interest; + } + bool being_analyzed: 1; + bool been_analyzed: 1; + bool determine_word: 1; + bool determine_newline: 1; + bool determine_start: 1; + bool follows_word_interest: 1; + bool follows_newline_interest: 1; + bool follows_start_interest: 1; +}; + + +STATIC_CHECK(sizeof(NodeInfo) <= sizeof(int)); // NOLINT + + +class SiblingList { + public: + SiblingList() : list_(NULL) { } + int length() { + return list_ == NULL ? 0 : list_->length(); + } + void Ensure(RegExpNode* parent) { + if (list_ == NULL) { + list_ = new ZoneList(2); + list_->Add(parent); + } + } + void Add(RegExpNode* node) { list_->Add(node); } + RegExpNode* Get(int index) { return list_->at(index); } + private: + ZoneList* list_; +}; + + +class RegExpNode: public ZoneObject { + public: + virtual ~RegExpNode() { } + virtual void Accept(NodeVisitor* visitor) = 0; + // Generates a goto to this node or actually generates the code at this point. + // Until the implementation is complete we will return true for success and + // false for failure. + virtual bool GoTo(RegExpCompiler* compiler); + Label* label(); + + // Until the implementation is complete we will return true for success and + // false for failure. 
+ virtual bool Emit(RegExpCompiler* compiler) = 0; + virtual RegExpNode* PropagateInterest(NodeInfo* info) = 0; + NodeInfo* info() { return &info_; } + virtual bool IsBacktrack() { return false; } + RegExpNode* GetSibling(NodeInfo* info); + void EnsureSiblings() { siblings_.Ensure(this); } + void AddSibling(RegExpNode* node) { siblings_.Add(node); } + protected: + inline void Bind(RegExpMacroAssembler* macro); + private: + Label label_; + NodeInfo info_; + SiblingList siblings_; +}; + + +class SeqRegExpNode: public RegExpNode { + public: + explicit SeqRegExpNode(RegExpNode* on_success) + : on_success_(on_success) { } + RegExpNode* on_success() { return on_success_; } + void set_on_success(RegExpNode* node) { on_success_ = node; } + virtual bool Emit(RegExpCompiler* compiler) { return false; } + private: + RegExpNode* on_success_; +}; + + +class ActionNode: public SeqRegExpNode { + public: + enum Type { + STORE_REGISTER, + INCREMENT_REGISTER, + STORE_POSITION, + SAVE_POSITION, + RESTORE_POSITION, + BEGIN_SUBMATCH, + ESCAPE_SUBMATCH + }; + static ActionNode* StoreRegister(int reg, int val, RegExpNode* on_success); + static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); + static ActionNode* StorePosition(int reg, RegExpNode* on_success); + static ActionNode* SavePosition(int reg, RegExpNode* on_success); + static ActionNode* RestorePosition(int reg, RegExpNode* on_success); + static ActionNode* BeginSubmatch(int reg, RegExpNode* on_success); + static ActionNode* EscapeSubmatch(int reg, RegExpNode* on_success); + virtual void Accept(NodeVisitor* visitor); + virtual bool Emit(RegExpCompiler* compiler); + virtual RegExpNode* PropagateInterest(NodeInfo* info); + private: + union { + struct { + int reg; + int value; + } u_store_register; + struct { + int reg; + } u_increment_register; + struct { + int reg; + } u_position_register; + struct { + int reg; + } u_submatch_stack_pointer_register; + } data_; + ActionNode(Type type, RegExpNode* on_success) + : 
SeqRegExpNode(on_success), + type_(type) { } + Type type_; + friend class DotPrinter; +}; + + +class TextNode: public SeqRegExpNode { + public: + TextNode(ZoneList* elms, + RegExpNode* on_success, + RegExpNode* on_failure) + : SeqRegExpNode(on_success), + on_failure_(on_failure), + elms_(elms) { } + virtual void Accept(NodeVisitor* visitor); + virtual RegExpNode* PropagateInterest(NodeInfo* info); + RegExpNode* on_failure() { return on_failure_; } + virtual bool Emit(RegExpCompiler* compiler); + ZoneList* elements() { return elms_; } + private: + RegExpNode* on_failure_; + ZoneList* elms_; +}; + + +class BackReferenceNode: public SeqRegExpNode { + public: + BackReferenceNode(int start_reg, + int end_reg, + RegExpNode* on_success, + RegExpNode* on_failure) + : SeqRegExpNode(on_success), + on_failure_(on_failure), + start_reg_(start_reg), + end_reg_(end_reg) { } + virtual void Accept(NodeVisitor* visitor); + RegExpNode* on_failure() { return on_failure_; } + int start_register() { return start_reg_; } + int end_register() { return end_reg_; } + virtual bool Emit(RegExpCompiler* compiler); + virtual RegExpNode* PropagateInterest(NodeInfo* info); + private: + RegExpNode* on_failure_; + int start_reg_; + int end_reg_; +}; + + +class EndNode: public RegExpNode { + public: + enum Action { ACCEPT, BACKTRACK }; + explicit EndNode(Action action) : action_(action) { } + virtual void Accept(NodeVisitor* visitor); + virtual bool Emit(RegExpCompiler* compiler); + virtual RegExpNode* PropagateInterest(NodeInfo* info); + virtual bool IsBacktrack() { return action_ == BACKTRACK; } + virtual bool GoTo(RegExpCompiler* compiler); + private: + Action action_; +}; + + +class Guard: public ZoneObject { + public: + enum Relation { LT, GEQ }; + Guard(int reg, Relation op, int value) + : reg_(reg), + op_(op), + value_(value) { } + int reg() { return reg_; } + Relation op() { return op_; } + int value() { return value_; } + private: + int reg_; + Relation op_; + int value_; +}; + + +class 
GuardedAlternative { + public: + explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { } + void AddGuard(Guard* guard); + RegExpNode* node() { return node_; } + void set_node(RegExpNode* node) { node_ = node; } + ZoneList* guards() { return guards_; } + private: + RegExpNode* node_; + ZoneList* guards_; +}; + + +class ChoiceNode: public RegExpNode { + public: + explicit ChoiceNode(int expected_size, RegExpNode* on_failure) + : on_failure_(on_failure), + alternatives_(new ZoneList(expected_size)), + table_calculated_(false), + being_calculated_(false) { } + virtual void Accept(NodeVisitor* visitor); + void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); } + ZoneList* alternatives() { return alternatives_; } + DispatchTable* table() { return &table_; } + RegExpNode* on_failure() { return on_failure_; } + virtual bool Emit(RegExpCompiler* compiler); + virtual RegExpNode* PropagateInterest(NodeInfo* info); + bool table_calculated() { return table_calculated_; } + void set_table_calculated(bool b) { table_calculated_ = b; } + bool being_calculated() { return being_calculated_; } + void set_being_calculated(bool b) { being_calculated_ = b; } + private: + void GenerateGuard(RegExpMacroAssembler* macro_assembler, + Guard *guard, + Label* on_failure); + RegExpNode* on_failure_; + ZoneList* alternatives_; + DispatchTable table_; + bool table_calculated_; + bool being_calculated_; +}; + + +class NodeVisitor { + public: + virtual ~NodeVisitor() { } +#define DECLARE_VISIT(Type) \ + virtual void Visit##Type(Type##Node* that) = 0; +FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT +}; + + +// Node visitor used to add the start set of the alternatives to the +// dispatch table of a choice node. 
+class DispatchTableConstructor: public NodeVisitor { + public: + explicit DispatchTableConstructor(DispatchTable* table) + : table_(table), + choice_index_(-1) { } + + void BuildTable(ChoiceNode* node); + + void AddRange(CharacterRange range) { + table()->AddRange(range, choice_index_); + } + + void AddInverse(ZoneList* ranges); + +#define DECLARE_VISIT(Type) \ + virtual void Visit##Type(Type##Node* that); +FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT + + DispatchTable* table() { return table_; } + void set_choice_index(int value) { choice_index_ = value; } + + protected: + DispatchTable *table_; + int choice_index_; +}; + + +class Analysis: public NodeVisitor { + public: + void EnsureAnalyzed(RegExpNode* node); + +#define DECLARE_VISIT(Type) \ + virtual void Visit##Type(Type##Node* that); +FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT +}; + + +struct RegExpParseResult { + RegExpTree* tree; + bool has_character_escapes; + Handle error; + int capture_count; +}; + + +class RegExpEngine: public AllStatic { + public: + static Handle Compile(RegExpParseResult* input, + RegExpNode** node_return, + bool ignore_case); + static void DotPrint(const char* label, RegExpNode* node); +}; + + } } // namespace v8::internal #endif // V8_JSREGEXP_H_ diff --git a/src/list-inl.h b/src/list-inl.h index a185af3..80094fb 100644 --- a/src/list-inl.h +++ b/src/list-inl.h @@ -90,11 +90,18 @@ void List::Iterate(void (*callback)(T* x)) { template +bool List::Contains(const T& elm) { + for (int i = 0; i < length_; i++) { + if (data_[i] == elm) + return true; + } + return false; +} + + +template void List::Sort(int (*cmp)(const T* x, const T* y)) { - qsort(data_, - length_, - sizeof(T), - reinterpret_cast(cmp)); + ToVector().Sort(cmp); #ifdef DEBUG for (int i = 1; i < length_; i++) ASSERT(cmp(&data_[i - 1], &data_[i]) <= 0); @@ -103,6 +110,12 @@ void List::Sort(int (*cmp)(const T* x, const T* y)) { template +void List::Sort() { + Sort(PointerSpaceship); +} + + +template 
void List::Initialize(int capacity) { ASSERT(capacity >= 0); data_ = (capacity > 0) ? NewData(capacity) : NULL; diff --git a/src/list.h b/src/list.h index 34b18fb..2f8aa90 100644 --- a/src/list.h +++ b/src/list.h @@ -46,6 +46,7 @@ namespace v8 { namespace internal { template class List { public: + INLINE(explicit List(int capacity)) { Initialize(capacity); } INLINE(~List()) { DeleteData(data_); } @@ -67,6 +68,8 @@ class List { Vector ToVector() { return Vector(data_, length_); } + Vector ToConstVector() { return Vector(data_, length_); } + // Adds a copy of the given 'element' to the end of the list, // expanding the list if necessary. T& Add(const T& element); @@ -92,11 +95,14 @@ class List { // Drops all but the first 'pos' elements from the list. INLINE(void Rewind(int pos)); + bool Contains(const T& elm); + // Iterate through all list entries, starting at index 0. void Iterate(void (*callback)(T* x)); // Sort all list entries (using QuickSort) void Sort(int (*cmp)(const T* x, const T* y)); + void Sort(); INLINE(void Initialize(int capacity)); diff --git a/src/objects-debug.cc b/src/objects-debug.cc index f2dd3b5..1c1ffed 100644 --- a/src/objects-debug.cc +++ b/src/objects-debug.cc @@ -670,7 +670,14 @@ void JSRegExp::JSRegExpVerify() { } case JSRegExp::JSCRE: { FixedArray* arr = FixedArray::cast(data()); - ASSERT(arr->get(JSRegExp::kJscreDataIndex)->IsFixedArray()); + Object* jscre_data = arr->get(JSRegExp::kJscreDataIndex); + ASSERT(jscre_data->IsFixedArray() || jscre_data->IsUndefined()); + break; + } + case JSRegExp::IRREGEXP: { + FixedArray* arr = FixedArray::cast(data()); + Object* jscre_data = arr->get(JSRegExp::kJscreDataIndex); + ASSERT(jscre_data->IsFixedArray()); break; } default: diff --git a/src/objects-inl.h b/src/objects-inl.h index 97c6819..e8c4e91 100644 --- a/src/objects-inl.h +++ b/src/objects-inl.h @@ -279,6 +279,16 @@ bool StringShape::IsExternalTwoByte() { } +uc32 FlatStringReader::Get(int index) { + ASSERT(0 <= index && index <= length_); + 
if (is_ascii_) { + return static_cast(start_)[index]; + } else { + return static_cast(start_)[index]; + } +} + + bool Object::IsNumber() { return IsSmi() || IsHeapNumber(); } @@ -1142,6 +1152,13 @@ Object* FixedArray::get(int index) { } +void FixedArray::set(int index, Smi* value) { + ASSERT(reinterpret_cast(value)->IsSmi()); + int offset = kHeaderSize + index * kPointerSize; + WRITE_FIELD(this, offset, value); +} + + void FixedArray::set(int index, Object* value) { ASSERT(index >= 0 && index < this->length()); int offset = kHeaderSize + index * kPointerSize; @@ -1747,6 +1764,7 @@ Code::Flags Code::flags() { void Code::set_flags(Code::Flags flags) { + STATIC_ASSERT(Code::NUMBER_OF_KINDS <= (kFlagsKindMask >> kFlagsKindShift)+1); // Make sure that all call stubs have an arguments count. ASSERT(ExtractKindFromFlags(flags) != CALL_IC || ExtractArgumentsCountFromFlags(flags) >= 0); @@ -2213,6 +2231,22 @@ JSRegExp::Type JSRegExp::TypeTag() { } +JSRegExp::Flags JSRegExp::GetFlags() { + ASSERT(this->data()->IsFixedArray()); + Object* data = this->data(); + Smi* smi = Smi::cast(FixedArray::cast(data)->get(kFlagsIndex)); + return Flags(smi->value()); +} + + +String* JSRegExp::Pattern() { + ASSERT(this->data()->IsFixedArray()); + Object* data = this->data(); + String* pattern= String::cast(FixedArray::cast(data)->get(kSourceIndex)); + return pattern; +} + + Object* JSRegExp::DataAt(int index) { ASSERT(TypeTag() != NOT_COMPILED); return FixedArray::cast(data())->get(index); diff --git a/src/objects.cc b/src/objects.cc index b4fe552..9b88310 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -3501,6 +3501,57 @@ const unibrow::byte* String::ReadBlock(String* input, } +FlatStringReader* FlatStringReader::top_ = NULL; + + +FlatStringReader::FlatStringReader(Handle str) + : str_(str.location()), + length_(str->length()), + prev_(top_) { + top_ = this; + RefreshState(); +} + + +FlatStringReader::FlatStringReader(Vector input) + : str_(NULL), + is_ascii_(true), + 
length_(input.length()), + start_(input.start()), + prev_(top_) { + top_ = this; +} + + +FlatStringReader::~FlatStringReader() { + ASSERT_EQ(top_, this); + top_ = prev_; +} + + +void FlatStringReader::RefreshState() { + if (str_ == NULL) return; + Handle str(str_); + StringShape shape(*str); + ASSERT(str->IsFlat(shape)); + is_ascii_ = shape.IsAsciiRepresentation(); + if (is_ascii_) { + start_ = str->ToAsciiVector().start(); + } else { + start_ = str->ToUC16Vector().start(); + } +} + + +void FlatStringReader::PostGarbageCollectionProcessing() { + FlatStringReader* current = top_; + while (current != NULL) { + current->RefreshState(); + current = current->prev_; + } +} + + void StringInputBuffer::Seek(unsigned pos) { Reset(pos, input_); } diff --git a/src/objects.h b/src/objects.h index 82edc53..f6feca8 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1498,9 +1498,12 @@ class FixedArray: public Array { // Setter and getter for elements. inline Object* get(int index); + // Setter that uses write barrier. inline void set(int index, Object* value); - // Setter with barrier mode. + // Setter that doesn't need write barrier). + inline void set(int index, Smi* value); + // Setter with explicit barrier mode. inline void set(int index, Object* value, WriteBarrierMode mode); // Setters for frequently used oddballs located in old space. @@ -2114,14 +2117,17 @@ class Code: public HeapObject { CALL_IC, STORE_IC, KEYED_STORE_IC, + // No more than eight kinds. The value currently encoded in three bits in + // Flags. // Pseudo-kinds. 
+ REGEXP = BUILTIN, FIRST_IC_KIND = LOAD_IC, LAST_IC_KIND = KEYED_STORE_IC }; enum { - NUMBER_OF_KINDS = LAST_IC_KIND + 1 + NUMBER_OF_KINDS = KEYED_STORE_IC + 1 }; // A state indicates that inline cache in this Code object contains @@ -2272,7 +2278,6 @@ class Code: public HeapObject { static const int kFlagsTypeMask = 0x000001C0; // 111000000 static const int kFlagsArgumentsCountMask = 0xFFFFFE00; - private: DISALLOW_IMPLICIT_CONSTRUCTORS(Code); }; @@ -2912,7 +2917,13 @@ class JSValue: public JSObject { // Regular expressions class JSRegExp: public JSObject { public: - enum Type { NOT_COMPILED, JSCRE, ATOM }; + // Meaning of Type: + // NOT_COMPILED: Initial value. No data has been stored in the JSRegExp yet. + // JSCRE: A complex RegExp for JSCRE + // ATOM: A simple string to match against using an indexOf operation. + // IRREGEXP: Compiled with Irregexp. + // IRREGEXP_NATIVE: Compiled to native code with Irregexp. + enum Type { NOT_COMPILED, JSCRE, ATOM, IRREGEXP, IRREGEXP_NATIVE }; enum Flag { NONE = 0, GLOBAL = 1, IGNORE_CASE = 2, MULTILINE = 4 }; class Flags { @@ -2929,6 +2940,8 @@ class JSRegExp: public JSObject { DECL_ACCESSORS(data, Object) inline Type TypeTag(); + inline Flags GetFlags(); + inline String* Pattern(); inline Object* DataAt(int index); static inline JSRegExp* cast(Object* obj); @@ -2945,10 +2958,11 @@ class JSRegExp: public JSObject { static const int kTagIndex = 0; static const int kSourceIndex = kTagIndex + 1; static const int kFlagsIndex = kSourceIndex + 1; - // These two are the same since the same entry is shared for + // These three are the same since the same entry is shared for // different purposes in different types of regexps. 
static const int kAtomPatternIndex = kFlagsIndex + 1; static const int kJscreDataIndex = kFlagsIndex + 1; + static const int kIrregexpDataIndex = kFlagsIndex + 1; static const int kDataSize = kAtomPatternIndex + 1; }; @@ -3578,6 +3592,28 @@ class ExternalTwoByteString: public ExternalString { }; +// A flat string reader provides random access to the contents of a +// string independent of the character width of the string. The handle +// must be valid as long as the reader is being used. +class FlatStringReader BASE_EMBEDDED { + public: + explicit FlatStringReader(Handle str); + explicit FlatStringReader(Vector input); + ~FlatStringReader(); + void RefreshState(); + inline uc32 Get(int index); + int length() { return length_; } + static void PostGarbageCollectionProcessing(); + private: + String** str_; + bool is_ascii_; + int length_; + const void* start_; + FlatStringReader* prev_; + static FlatStringReader* top_; +}; + + // Note that StringInputBuffers are not valid across a GC! To fix this // it would have to store a String Handle instead of a String* and // AsciiStringReadBlock would have to be modified to use memcpy. diff --git a/src/parser.cc b/src/parser.cc index bcc439f..ff9bbdc 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -34,6 +34,7 @@ #include "runtime.h" #include "parser.h" #include "scopes.h" +#include "string-stream.h" namespace v8 { namespace internal { @@ -227,6 +228,334 @@ class Parser { }; +template +class BufferedZoneList { + public: + + BufferedZoneList() : + list_(NULL), last_(NULL) {} + + // Adds element at end of list. This element is buffered and can + // be read using last() or removed using RemoveLast until a new Add or until + // RemoveLast or GetList has been called. 
+ void Add(T* value) { + if (last_ != NULL) { + if (list_ == NULL) { + list_ = new ZoneList(initial_size); + } + list_->Add(last_); + } + last_ = value; + } + + T* last() { + ASSERT(last_ != NULL); + return last_; + } + + T* RemoveLast() { + ASSERT(last_ != NULL); + T* result = last_; + if (list_ != NULL && list_->length() > 0) + last_ = list_->RemoveLast(); + else + last_ = NULL; + return result; + } + + T* Get(int i) { + ASSERT(0 <= i && i < length()); + if (list_ == NULL) { + ASSERT_EQ(0, i); + return last_; + } else { + if (i == list_->length()) { + ASSERT(last_ != NULL); + return last_; + } else { + return list_->at(i); + } + } + } + + void Clear() { + list_ = NULL; + last_ = NULL; + } + + int length() { + int length = (list_ == NULL) ? 0 : list_->length(); + return length + ((last_ == NULL) ? 0 : 1); + } + + ZoneList* GetList() { + if (list_ == NULL) { + list_ = new ZoneList(initial_size); + } + if (last_ != NULL) { + list_->Add(last_); + last_ = NULL; + } + return list_; + } + + private: + ZoneList* list_; + T* last_; +}; + +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. +class RegExpBuilder { + public: + RegExpBuilder(); + void AddCharacter(uc16 character); + // "Adds" an empty expression. 
Does nothing except consume a + // following quantifier + void AddEmpty(); + void AddAtom(RegExpTree* tree); + void AddAssertion(RegExpTree* tree); + void NewAlternative(); // '|' + void AddQuantifierToAtom(int min, int max, bool is_greedy); + RegExpTree* ToRegExp(); + private: + void FlushCharacters(); + void FlushText(); + void FlushTerms(); + bool pending_empty_; + ZoneList* characters_; + BufferedZoneList terms_; + BufferedZoneList text_; + BufferedZoneList alternatives_; +#ifdef DEBUG + enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; +#define LAST(x) last_added_ = x; +#else +#define LAST(x) +#endif +}; + + +RegExpBuilder::RegExpBuilder() + : pending_empty_(false), characters_(NULL), terms_(), alternatives_() +#ifdef DEBUG + , last_added_(ADD_NONE) +#endif + {} + + +void RegExpBuilder::FlushCharacters() { + pending_empty_ = false; + if (characters_ != NULL) { + RegExpTree* atom = new RegExpAtom(characters_->ToConstVector()); + characters_ = NULL; + text_.Add(atom); + LAST(ADD_ATOM); + } +} + + +void RegExpBuilder::FlushText() { + FlushCharacters(); + int num_text = text_.length(); + if (num_text == 0) { + return; + } else if (num_text == 1) { + terms_.Add(text_.last()); + } else { + RegExpText* text = new RegExpText(); + for (int i = 0; i < num_text; i++) + text_.Get(i)->AppendToText(text); + terms_.Add(text); + } + text_.Clear(); +} + + +void RegExpBuilder::AddCharacter(uc16 c) { + pending_empty_ = false; + if (characters_ == NULL) { + characters_ = new ZoneList(4); + } + characters_->Add(c); + LAST(ADD_CHAR); +} + + +void RegExpBuilder::AddEmpty() { + pending_empty_ = true; +} + + +void RegExpBuilder::AddAtom(RegExpTree* term) { + if (term->IsEmpty()) { + AddEmpty(); + return; + } + if (term->IsTextElement()) { + FlushCharacters(); + text_.Add(term); + } else { + FlushText(); + terms_.Add(term); + } + LAST(ADD_ATOM); +} + + +void RegExpBuilder::AddAssertion(RegExpTree* assert) { + FlushText(); + terms_.Add(assert); + LAST(ADD_ASSERT); 
+} + + +void RegExpBuilder::NewAlternative() { + FlushTerms(); +} + + +void RegExpBuilder::FlushTerms() { + FlushText(); + int num_terms = terms_.length(); + RegExpTree* alternative; + if (num_terms == 0) { + alternative = RegExpEmpty::GetInstance(); + } else if (num_terms == 1) { + alternative = terms_.last(); + } else { + alternative = new RegExpAlternative(terms_.GetList()); + } + alternatives_.Add(alternative); + terms_.Clear(); + LAST(ADD_NONE); +} + + +RegExpTree* RegExpBuilder::ToRegExp() { + FlushTerms(); + int num_alternatives = alternatives_.length(); + if (num_alternatives == 0) { + return RegExpEmpty::GetInstance(); + } + if (num_alternatives == 1) { + return alternatives_.last(); + } + return new RegExpDisjunction(alternatives_.GetList()); +} + + +void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) { + if (pending_empty_) { + pending_empty_ = false; + return; + } + RegExpTree* atom; + if (characters_ != NULL) { + ASSERT(last_added_ == ADD_CHAR); + // Last atom was character. + Vector char_vector = characters_->ToConstVector(); + int num_chars = char_vector.length(); + if (num_chars > 1) { + Vector prefix = char_vector.SubVector(0, num_chars - 1); + text_.Add(new RegExpAtom(prefix)); + char_vector = char_vector.SubVector(num_chars - 1, num_chars); + } + characters_ = NULL; + atom = new RegExpAtom(char_vector); + FlushText(); + } else if (text_.length() > 0) { + ASSERT(last_added_ == ADD_ATOM); + atom = text_.RemoveLast(); + FlushText(); + } else if (terms_.length() > 0) { + ASSERT(last_added_ == ADD_ATOM); + atom = terms_.RemoveLast(); + if (atom->IsLookahead() || atom->IsAssertion()) { + // Guaranteed not to match a non-empty string. + // Assertion as an atom can happen as, e.g., (?:\b) + LAST(ADD_TERM); + if (min == 0) { + return; + } + terms_.Add(atom); + return; + } + } else { + // Only call immediately after adding an atom or character! 
+ UNREACHABLE(); + return; + } + terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom)); + LAST(ADD_TERM); +} + + +class RegExpParser { + public: + RegExpParser(FlatStringReader* in, + Handle* error, + bool multiline_mode); + RegExpTree* ParsePattern(bool* ok); + RegExpTree* ParseDisjunction(bool* ok); + RegExpTree* ParseGroup(bool* ok); + RegExpTree* ParseCharacterClass(bool* ok); + + // Parses a {...,...} quantifier and stores the range in the given + // out parameters. + bool ParseIntervalQuantifier(int* min_out, int* max_out); + + // Parses and returns a single escaped character. The character + // must not be 'b' or 'B' since they are usually handle specially. + uc32 ParseClassCharacterEscape(bool* ok); + + // Checks whether the following is a length-digit hexadecimal number, + // and sets the value if it is. + bool ParseHexEscape(int length, uc32* value); + + uc32 ParseControlLetterEscape(bool* ok); + uc32 ParseOctalLiteral(); + + // Tries to parse the input as a back reference. If successful it + // stores the result in the output parameter and returns true. If + // it fails it will push back the characters read so the same characters + // can be reparsed. + bool ParseBackReferenceIndex(int* index_out); + + CharacterRange ParseClassAtom(bool* is_char_class, + ZoneList* ranges, + bool* ok); + RegExpTree* ReportError(Vector message, bool* ok); + void Advance(); + void Advance(int dist); + void Reset(int pos); + + bool HasCharacterEscapes(); + + int captures_started() { return captures_ == NULL ? 
0 : captures_->length(); } + int position() { return next_pos_ - 1; } + + static const uc32 kEndMarker = (1 << 21); + private: + uc32 current() { return current_; } + bool has_more() { return has_more_; } + bool has_next() { return next_pos_ < in()->length(); } + uc32 Next(); + FlatStringReader* in() { return in_; } + void ScanForCaptures(); + bool CaptureAvailable(int index); + uc32 current_; + bool has_more_; + bool multiline_mode_; + int next_pos_; + FlatStringReader* in_; + Handle* error_; + bool has_character_escapes_; + bool is_scanned_for_captures_; + ZoneList* captures_; + int capture_count_; +}; + + // A temporary scope stores information during parsing, just like // a plain scope. However, temporary scopes are not kept around // after parsing or referenced by syntax trees so they can be stack- @@ -3164,6 +3493,756 @@ Expression* Parser::NewThrowError(Handle constructor, // ---------------------------------------------------------------------------- +// Regular expressions + + +RegExpParser::RegExpParser(FlatStringReader* in, + Handle* error, + bool multiline_mode) + : current_(kEndMarker), + has_more_(true), + multiline_mode_(multiline_mode), + next_pos_(0), + in_(in), + error_(error), + has_character_escapes_(false), + is_scanned_for_captures_(false), + captures_(NULL), + capture_count_(0) { + Advance(1); +} + + +uc32 RegExpParser::Next() { + if (has_next()) { + return in()->Get(next_pos_); + } else { + return kEndMarker; + } +} + + +void RegExpParser::Advance() { + if (next_pos_ < in()->length()) { + current_ = in()->Get(next_pos_); + next_pos_++; + } else { + current_ = kEndMarker; + has_more_ = false; + } +} + + +void RegExpParser::Reset(int pos) { + next_pos_ = pos; + Advance(); +} + + +void RegExpParser::Advance(int dist) { + for (int i = 0; i < dist; i++) + Advance(); +} + + +// Reports whether the parsed string atoms contain any characters that were +// escaped in the original pattern. 
If not, all atoms are proper substrings +// of the original pattern. +bool RegExpParser::HasCharacterEscapes() { + return has_character_escapes_; +} + +RegExpTree* RegExpParser::ReportError(Vector message, bool* ok) { + *ok = false; + *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); + return NULL; +} + + +// Pattern :: +// Disjunction +RegExpTree* RegExpParser::ParsePattern(bool* ok) { + RegExpTree* result = ParseDisjunction(CHECK_OK); + if (has_more()) { + ReportError(CStrVector("Unmatched ')'"), CHECK_OK); + } + return result; +} + + +bool RegExpParser::CaptureAvailable(int index) { + if (captures_ == NULL) return false; + if (index >= captures_->length()) return false; + RegExpCapture* capture = captures_->at(index); + return capture != NULL && capture->available() == CAPTURE_AVAILABLE; +} + + +// Disjunction :: +// Alternative +// Alternative | Disjunction +// Alternative :: +// [empty] +// Term Alternative +// Term :: +// Assertion +// Atom +// Atom Quantifier +RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { + RegExpBuilder builder; + int capture_start_index = captures_started(); + while (true) { + switch (current()) { + case kEndMarker: + case ')': + return builder.ToRegExp(); + case '|': { + Advance(); + builder.NewAlternative(); + int capture_new_alt_start_index = captures_started(); + for (int i = capture_start_index; i < capture_new_alt_start_index; i++) { + RegExpCapture* capture = captures_->at(i); + if (capture->available() == CAPTURE_AVAILABLE) { + capture->set_available(CAPTURE_UNREACHABLE); + } + } + capture_start_index = capture_new_alt_start_index; + continue; + } + case '*': + case '+': + case '?': + ReportError(CStrVector("Nothing to repeat"), CHECK_OK); + case '^': { + Advance(); + RegExpAssertion::Type type = + multiline_mode_ ? 
RegExpAssertion::START_OF_LINE : + RegExpAssertion::START_OF_INPUT; + builder.AddAssertion(new RegExpAssertion(type)); + continue; + } + case '$': { + Advance(); + RegExpAssertion::Type type = + multiline_mode_ ? RegExpAssertion::END_OF_LINE : + RegExpAssertion::END_OF_INPUT; + builder.AddAssertion(new RegExpAssertion(type)); + continue; + } + case '.': { + Advance(); + // everything except \x0a, \x0d, \u2028 and \u2029 + ZoneList* ranges = new ZoneList(2); + CharacterRange::AddClassEscape('.', ranges); + RegExpTree* atom = new RegExpCharacterClass(ranges, false); + builder.AddAtom(atom); + break; + } + case '(': { + RegExpTree* atom = ParseGroup(CHECK_OK); + builder.AddAtom(atom); + break; + } + case '[': { + RegExpTree* atom = ParseCharacterClass(CHECK_OK); + builder.AddAtom(atom); + break; + } + // Atom :: + // \ AtomEscape + case '\\': + switch (Next()) { + case kEndMarker: + ReportError(CStrVector("\\ at end of pattern"), CHECK_OK); + case 'b': + Advance(2); + builder.AddAssertion( + new RegExpAssertion(RegExpAssertion::BOUNDARY)); + continue; + case 'B': + Advance(2); + builder.AddAssertion( + new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); + continue; + // AtomEscape :: + // CharacterClassEscape + // + // CharacterClassEscape :: one of + // d D s S w W + case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { + uc32 c = Next(); + Advance(2); + ZoneList* ranges = new ZoneList(2); + CharacterRange::AddClassEscape(c, ranges); + RegExpTree* atom = new RegExpCharacterClass(ranges, false); + builder.AddAtom(atom); + goto has_read_atom; // Avoid setting has_character_escapes_. 
+ } + case '1': case '2': case '3': case '4': case '5': case '6': + case '7': case '8': case '9': { + int index = 0; + if (ParseBackReferenceIndex(&index)) { + if (!CaptureAvailable(index - 1)) { + // Prepare to ignore a following quantifier + builder.AddEmpty(); + goto has_read_atom; + } + RegExpCapture* capture = captures_->at(index - 1); + RegExpTree* atom = new RegExpBackReference(capture); + builder.AddAtom(atom); + goto has_read_atom; // Avoid setting has_character_escapes_. + } + uc32 first_digit = Next(); + if (first_digit == '8' || first_digit == '9') { + // Treat as identity escape + builder.AddCharacter(first_digit); + Advance(2); + break; + } + } + // FALLTHROUGH + case '0': { + Advance(); + uc32 octal = ParseOctalLiteral(); + builder.AddCharacter(octal); + break; + } + // ControlEscape :: one of + // f n r t v + case 'f': + Advance(2); + builder.AddCharacter('\f'); + break; + case 'n': + Advance(2); + builder.AddCharacter('\n'); + break; + case 'r': + Advance(2); + builder.AddCharacter('\r'); + break; + case 't': + Advance(2); + builder.AddCharacter('\t'); + break; + case 'v': + Advance(2); + builder.AddCharacter('\v'); + break; + case 'c': { + Advance(2); + uc32 control = ParseControlLetterEscape(ok); + builder.AddCharacter(control); + break; + } + case 'x': { + Advance(2); + uc32 value; + if (ParseHexEscape(2, &value)) { + builder.AddCharacter(value); + } else { + builder.AddCharacter('x'); + } + break; + } + case 'u': { + Advance(2); + uc32 value; + if (ParseHexEscape(4, &value)) { + builder.AddCharacter(value); + } else { + builder.AddCharacter('u'); + } + break; + } + default: + // Identity escape. 
+ builder.AddCharacter(Next()); + Advance(2); + break; + } + has_character_escapes_ = true; + break; + case '{': { + int dummy; + if (ParseIntervalQuantifier(&dummy, &dummy)) { + ReportError(CStrVector("Nothing to repeat"), CHECK_OK); + } + // fallthrough + } + default: + builder.AddCharacter(current()); + Advance(); + break; + } // end switch(current()) + + has_read_atom: + int min; + int max; + switch (current()) { + // QuantifierPrefix :: + // * + // + + // ? + // { + case '*': + min = 0; + max = RegExpQuantifier::kInfinity; + Advance(); + break; + case '+': + min = 1; + max = RegExpQuantifier::kInfinity; + Advance(); + break; + case '?': + min = 0; + max = 1; + Advance(); + break; + case '{': + if (ParseIntervalQuantifier(&min, &max)) { + break; + } else { + continue; + } + default: + continue; + } + bool is_greedy = true; + if (current() == '?') { + is_greedy = false; + Advance(); + } + builder.AddQuantifierToAtom(min, max, is_greedy); + } +} + +class SourceCharacter { + public: + static bool Is(uc32 c) { + switch (c) { + // case ']': case '}': + // In spidermonkey and jsc these are treated as source characters + // so we do too. + case '^': case '$': case '\\': case '.': case '*': case '+': + case '?': case '(': case ')': case '[': case '{': case '|': + case RegExpParser::kEndMarker: + return false; + default: + return true; + } + } +}; + + +static unibrow::Predicate source_character; + + +static inline bool IsSourceCharacter(uc32 c) { + return source_character.get(c); +} + +#ifdef DEBUG +// Currently only used in an ASSERT. +static bool IsSpecialClassEscape(uc32 c) { + switch (c) { + case 'd': case 'D': + case 's': case 'S': + case 'w': case 'W': + return true; + default: + return false; + } +} +#endif + + +// In order to know whether an escape is a backreference or not we have to scan +// the entire regexp and find the number of capturing parentheses. However we +// don't want to scan the regexp twice unless it is necessary. 
This mini-parser +// is called when needed. It can see the difference between capturing and +// noncapturing parentheses and can skip character classes and backslash-escaped +// characters. +void RegExpParser::ScanForCaptures() { + int n; + while ((n = current()) != kEndMarker) { + Advance(); + switch (n) { + case '\\': + Advance(); + break; + case '[': { + int c; + while ((c = current()) != kEndMarker) { + Advance(); + if (c == '\\') { + Advance(); + } else { + if (c == ']') break; + } + } + break; + } + case '(': + if (current() != '?') capture_count_++; + break; + } + } + is_scanned_for_captures_ = true; +} + + +bool RegExpParser::ParseBackReferenceIndex(int* index_out) { + ASSERT_EQ('\\', current()); + ASSERT('1' <= Next() && Next() <= '9'); + // Try to parse a decimal literal that is no greater than the number + // of previously encountered left capturing parentheses. + // This is a not according the the ECMAScript specification. According to + // that, one must accept values up to the total number of left capturing + // parentheses in the entire input, even if they are meaningless. 
+ if (!is_scanned_for_captures_) { + int saved_position = position(); + ScanForCaptures(); + Reset(saved_position); + } + if (capture_count_ == 0) return false; + int start = position(); + int value = Next() - '0'; + if (value > capture_count_) return false; + Advance(2); + while (true) { + uc32 c = current(); + if (IsDecimalDigit(c)) { + value = 10 * value + (c - '0'); + if (value > capture_count_) { + Reset(start); + return false; + } + Advance(); + } else { + break; + } + } + *index_out = value; + return true; +} + + +// QuantifierPrefix :: +// { DecimalDigits } +// { DecimalDigits , } +// { DecimalDigits , DecimalDigits } +bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { + ASSERT_EQ(current(), '{'); + int start = position(); + Advance(); + int min = 0; + if (!IsDecimalDigit(current())) { + Reset(start); + return false; + } + while (IsDecimalDigit(current())) { + min = 10 * min + (current() - '0'); + Advance(); + } + int max = 0; + if (current() == '}') { + max = min; + Advance(); + } else if (current() == ',') { + Advance(); + if (current() == '}') { + max = RegExpQuantifier::kInfinity; + Advance(); + } else { + while (IsDecimalDigit(current())) { + max = 10 * max + (current() - '0'); + Advance(); + } + if (current() != '}') { + Reset(start); + return false; + } + Advance(); + } + } else { + Reset(start); + return false; + } + *min_out = min; + *max_out = max; + return true; +} + + +// Upper and lower case letters differ by one bit. +STATIC_CHECK(('a' ^ 'A') == 0x20); + +uc32 RegExpParser::ParseControlLetterEscape(bool* ok) { + if (!has_more()) { + ReportError(CStrVector("\\c at end of pattern"), ok); + return '\0'; + } + uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters. + if (letter < 'A' || 'Z' < letter) { + // Non-spec error-correction: "\c" followed by non-control letter is + // interpreted as an IdentityEscape of 'c'. 
+ return 'c'; + } + Advance(); + return letter & 0x1f; // Remainder modulo 32, per specification. +} + + +uc32 RegExpParser::ParseOctalLiteral() { + ASSERT('0' <= current() && current() <= '7'); + // For compatibility with some other browsers (not all), we parse + // up to three octal digits with a value below 256. + uc32 value = current() - '0'; + Advance(); + if ('0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + if (value < 32 && '0' <= current() && current() <= '7') { + value = value * 8 + current() - '0'; + Advance(); + } + } + return value; +} + + +bool RegExpParser::ParseHexEscape(int length, uc32 *value) { + int start = position(); + uc32 val = 0; + bool done = false; + for (int i = 0; !done; i++) { + uc32 c = current(); + int d = HexValue(c); + if (d < 0) { + Reset(start); + return false; + } + val = val * 16 + d; + Advance(); + if (i == length - 1) { + done = true; + } + } + *value = val; + return true; +} + + +uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) { + ASSERT(current() == '\\'); + ASSERT(has_next() && !IsSpecialClassEscape(Next())); + Advance(); + switch (current()) { + case 'b': + Advance(); + return '\b'; + // ControlEscape :: one of + // f n r t v + case 'f': + Advance(); + return '\f'; + case 'n': + Advance(); + return '\n'; + case 'r': + Advance(); + return '\r'; + case 't': + Advance(); + return '\t'; + case 'v': + Advance(); + return '\v'; + case 'c': + return ParseControlLetterEscape(ok); + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': + // For compatibility, we interpret a decimal escape that isn't + // a back reference (and therefore either \0 or not valid according + // to the specification) as a 1..3 digit octal character code. + return ParseOctalLiteral(); + case 'x': { + Advance(); + uc32 value; + if (ParseHexEscape(2, &value)) { + return value; + } + // If \x is not followed by a two-digit hexadecimal, treat it + // as an identity escape. 
+ return 'x'; + } + case 'u': { + Advance(); + uc32 value; + if (ParseHexEscape(4, &value)) { + return value; + } + // If \u is not followed by a four-digit hexadecimal, treat it + // as an identity escape. + return 'u'; + } + default: { + // Extended identity escape. We accept any character that hasn't + // been matched by a more specific case, not just the subset required + // by the ECMAScript specification. + uc32 result = current(); + Advance(); + return result; + } + } + return 0; +} + + +RegExpTree* RegExpParser::ParseGroup(bool* ok) { + ASSERT_EQ(current(), '('); + char type = '('; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': case '=': case '!': + type = Next(); + Advance(2); + break; + default: + ReportError(CStrVector("Invalid group"), CHECK_OK); + break; + } + } else { + if (captures_ == NULL) { + captures_ = new ZoneList(2); + } + captures_->Add(NULL); + if (!is_scanned_for_captures_) capture_count_++; + } + int capture_index = captures_started(); + RegExpTree* body = ParseDisjunction(CHECK_OK); + if (current() != ')') { + ReportError(CStrVector("Unterminated group"), CHECK_OK); + } + Advance(); + + int end_capture_index = captures_started(); + if (type == '!') { + // Captures inside a negative lookahead are never available outside it. + for (int i = capture_index; i < end_capture_index; i++) { + RegExpCapture* capture = captures_->at(i); + ASSERT(capture != NULL); + capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE); + } + } else { + // Captures temporarily unavailable because they are in different + // alternatives are all available after the disjunction. 
+ for (int i = capture_index; i < end_capture_index; i++) { + RegExpCapture* capture = captures_->at(i); + ASSERT(capture != NULL); + if (capture->available() == CAPTURE_UNREACHABLE) { + capture->set_available(CAPTURE_AVAILABLE); + } + } + } + + if (type == '(') { + RegExpCapture* capture = new RegExpCapture(body, capture_index); + captures_->at(capture_index - 1) = capture; + return capture; + } else if (type == ':') { + return body; + } else { + ASSERT(type == '=' || type == '!'); + bool is_positive = (type == '='); + return new RegExpLookahead(body, is_positive); + } +} + + +CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class, + ZoneList* ranges, + bool* ok) { + ASSERT_EQ(false, *is_char_class); + uc32 first = current(); + if (first == '\\') { + switch (Next()) { + case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { + *is_char_class = true; + uc32 c = Next(); + CharacterRange::AddClassEscape(c, ranges); + Advance(2); + return NULL; + } + default: + uc32 c = ParseClassCharacterEscape(CHECK_OK); + return CharacterRange::Singleton(c); + } + } else { + Advance(); + return CharacterRange::Singleton(first); + } +} + + +RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) { + static const char* kUnterminated = "Unterminated character class"; + static const char* kIllegal = "Illegal character class"; + static const char* kRangeOutOfOrder = "Range out of order in character class"; + + ASSERT_EQ(current(), '['); + Advance(); + bool is_negated = false; + if (current() == '^') { + is_negated = true; + Advance(); + } + ZoneList* ranges = new ZoneList(2); + while (has_more() && current() != ']') { + bool is_char_class = false; + CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK); + if (!is_char_class) { + if (current() == '-') { + Advance(); + if (current() == kEndMarker) { + // If we reach the end we break out of the loop and let the + // following code report an error. 
+ break; + } else if (current() == ']') { + ranges->Add(first); + ranges->Add(CharacterRange::Singleton('-')); + break; + } + CharacterRange next = + ParseClassAtom(&is_char_class, ranges, CHECK_OK); + if (is_char_class) { + return ReportError(CStrVector(kIllegal), CHECK_OK); + } + if (first.from() > next.to()) { + return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK); + } + ranges->Add(CharacterRange::Range(first.from(), next.to())); + } else { + ranges->Add(first); + } + } + } + if (!has_more()) { + return ReportError(CStrVector(kUnterminated), CHECK_OK); + } + Advance(); + if (ranges->length() == 0) { + ranges->Add(CharacterRange::Range(0, 0xffff)); + is_negated = !is_negated; + } + return new RegExpCharacterClass(ranges, is_negated); +} + + +// ---------------------------------------------------------------------------- // The Parser interface. // MakeAST() is just a wrapper for the corresponding Parser calls @@ -3211,6 +4290,27 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream, } +bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) { + ASSERT(result != NULL); + // Get multiline flag somehow + RegExpParser parser(input, &result->error, false); + bool ok = true; + result->tree = parser.ParsePattern(&ok); + if (!ok) { + ASSERT(result->tree == NULL); + ASSERT(!result->error.is_null()); + } else { + ASSERT(result->tree != NULL); + ASSERT(result->error.is_null()); + } + if (ok) { + result->has_character_escapes = parser.HasCharacterEscapes(); + result->capture_count = parser.captures_started(); + } + return ok; +} + + FunctionLiteral* MakeAST(bool compile_in_global_context, Handle