From: yangguo@chromium.org Date: Fri, 26 Aug 2011 13:03:30 +0000 (+0000) Subject: Tentative implementation of string slices (hidden under the flag --string-slices). X-Git-Tag: upstream/4.7.83~18643 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=77141f78ff0b038ed74f96c9d443ea2ef27485ea;p=platform%2Fupstream%2Fv8.git Tentative implementation of string slices (hidden under the flag --string-slices). TEST=test/mjsunit/string-slices.js Review URL: http://codereview.chromium.org/7477045 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@9027 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- diff --git a/src/arm/code-stubs-arm.cc b/src/arm/code-stubs-arm.cc index ba345e2..ffe32bc 100644 --- a/src/arm/code-stubs-arm.cc +++ b/src/arm/code-stubs-arm.cc @@ -4367,6 +4367,8 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ cmp(r2, Operand(r0, ASR, kSmiTagSize)); __ b(gt, &runtime); + // Reset offset for possibly sliced string. + __ mov(r9, Operand(0)); // subject: Subject string // regexp_data: RegExp data (FixedArray) // Check the representation and encoding of the subject string. @@ -4374,33 +4376,45 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ ldr(r0, FieldMemOperand(subject, HeapObject::kMapOffset)); __ ldrb(r0, FieldMemOperand(r0, Map::kInstanceTypeOffset)); // First check for flat string. - __ tst(r0, Operand(kIsNotStringMask | kStringRepresentationMask)); + __ and_(r1, r0, Operand(kIsNotStringMask | kStringRepresentationMask), SetCC); STATIC_ASSERT((kStringTag | kSeqStringTag) == 0); __ b(eq, &seq_string); // subject: Subject string // regexp_data: RegExp data (FixedArray) - // Check for flat cons string. + // Check for flat cons string or sliced string. // A flat cons string is a cons string where the second part is the empty // string. In that case the subject string is just the first part of the cons // string. Also in this case the first part of the cons string is known to be // a sequential string or an external string. 
- STATIC_ASSERT(kExternalStringTag !=0); - STATIC_ASSERT((kConsStringTag & kExternalStringTag) == 0); - __ tst(r0, Operand(kIsNotStringMask | kExternalStringTag)); - __ b(ne, &runtime); + // In the case of a sliced string its offset has to be taken into account. + Label cons_string, check_encoding; + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmp(r1, Operand(kExternalStringTag)); + __ b(lt, &cons_string); + __ b(eq, &runtime); + + // String is sliced. + __ ldr(r9, FieldMemOperand(subject, SlicedString::kOffsetOffset)); + __ mov(r9, Operand(r9, ASR, kSmiTagSize)); + __ ldr(subject, FieldMemOperand(subject, SlicedString::kParentOffset)); + // r9: offset of sliced string, smi-tagged. + __ jmp(&check_encoding); + // String is a cons string, check whether it is flat. + __ bind(&cons_string); __ ldr(r0, FieldMemOperand(subject, ConsString::kSecondOffset)); __ LoadRoot(r1, Heap::kEmptyStringRootIndex); __ cmp(r0, r1); __ b(ne, &runtime); __ ldr(subject, FieldMemOperand(subject, ConsString::kFirstOffset)); + // Is first part of cons or parent of slice a flat string? + __ bind(&check_encoding); __ ldr(r0, FieldMemOperand(subject, HeapObject::kMapOffset)); __ ldrb(r0, FieldMemOperand(r0, Map::kInstanceTypeOffset)); - // Is first part a flat string? STATIC_ASSERT(kSeqStringTag == 0); __ tst(r0, Operand(kStringRepresentationMask)); __ b(ne, &runtime); - __ bind(&seq_string); // subject: Subject string // regexp_data: RegExp data (FixedArray) @@ -4466,21 +4480,30 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // For arguments 4 and 3 get string length, calculate start of string data and // calculate the shift of the index (0 for ASCII and 1 for two byte). 
- __ ldr(r0, FieldMemOperand(subject, String::kLengthOffset)); - __ mov(r0, Operand(r0, ASR, kSmiTagSize)); STATIC_ASSERT(SeqAsciiString::kHeaderSize == SeqTwoByteString::kHeaderSize); - __ add(r9, subject, Operand(SeqAsciiString::kHeaderSize - kHeapObjectTag)); + __ add(r8, subject, Operand(SeqAsciiString::kHeaderSize - kHeapObjectTag)); __ eor(r3, r3, Operand(1)); - // Argument 4 (r3): End of string data - // Argument 3 (r2): Start of string data + // Load the length from the original subject string from the previous stack + // frame. Therefore we have to use fp, which points exactly to two pointer + // sizes below the previous sp. (Because creating a new stack frame pushes + // the previous fp onto the stack and moves up sp by 2 * kPointerSize.) + __ ldr(r0, MemOperand(fp, kSubjectOffset + 2 * kPointerSize)); + // If slice offset is not 0, load the length from the original sliced string. + // Argument 4, r3: End of string data + // Argument 3, r2: Start of string data + // Prepare start and end index of the input. + __ add(r9, r8, Operand(r9, LSL, r3)); __ add(r2, r9, Operand(r1, LSL, r3)); - __ add(r3, r9, Operand(r0, LSL, r3)); + + __ ldr(r8, FieldMemOperand(r0, String::kLengthOffset)); + __ mov(r8, Operand(r8, ASR, kSmiTagSize)); + __ add(r3, r9, Operand(r8, LSL, r3)); // Argument 2 (r1): Previous index. // Already there // Argument 1 (r0): Subject string. - __ mov(r0, subject); + // Already there // Locate the code entry and call it. __ add(r7, r7, Operand(Code::kHeaderSize - kHeapObjectTag)); @@ -4497,12 +4520,12 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // Check the result. 
Label success; - __ cmp(r0, Operand(NativeRegExpMacroAssembler::SUCCESS)); + __ cmp(subject, Operand(NativeRegExpMacroAssembler::SUCCESS)); __ b(eq, &success); Label failure; - __ cmp(r0, Operand(NativeRegExpMacroAssembler::FAILURE)); + __ cmp(subject, Operand(NativeRegExpMacroAssembler::FAILURE)); __ b(eq, &failure); - __ cmp(r0, Operand(NativeRegExpMacroAssembler::EXCEPTION)); + __ cmp(subject, Operand(NativeRegExpMacroAssembler::EXCEPTION)); // If not exception it can only be retry. Handle that in the runtime system. __ b(ne, &runtime); // Result must now be exception. If there is no pending exception already a @@ -4514,18 +4537,18 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ mov(r2, Operand(ExternalReference(Isolate::k_pending_exception_address, isolate))); __ ldr(r0, MemOperand(r2, 0)); - __ cmp(r0, r1); + __ cmp(subject, r1); __ b(eq, &runtime); __ str(r1, MemOperand(r2, 0)); // Clear pending exception. // Check if the exception is a termination. If so, throw as uncatchable. __ LoadRoot(ip, Heap::kTerminationExceptionRootIndex); - __ cmp(r0, ip); + __ cmp(subject, ip); Label termination_exception; __ b(eq, &termination_exception); - __ Throw(r0); // Expects thrown value in r0. + __ Throw(subject); // Expects thrown value in r0. __ bind(&termination_exception); __ ThrowUncatchable(TERMINATION, r0); // Expects thrown value in r0. @@ -4803,6 +4826,7 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { Label flat_string; Label ascii_string; Label got_char_code; + Label sliced_string; // If the receiver is a smi trigger the non-string case. __ JumpIfSmi(object_, receiver_not_string_); @@ -4832,7 +4856,11 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { __ b(eq, &flat_string); // Handle non-flat strings. 
- __ tst(result_, Operand(kIsConsStringMask)); + __ and_(result_, result_, Operand(kStringRepresentationMask)); + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmp(result_, Operand(kExternalStringTag)); + __ b(gt, &sliced_string); __ b(eq, &call_runtime_); // ConsString. @@ -4840,15 +4868,26 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. + Label assure_seq_string; __ ldr(result_, FieldMemOperand(object_, ConsString::kSecondOffset)); __ LoadRoot(ip, Heap::kEmptyStringRootIndex); __ cmp(result_, Operand(ip)); __ b(ne, &call_runtime_); // Get the first of the two strings and load its instance type. __ ldr(object_, FieldMemOperand(object_, ConsString::kFirstOffset)); + __ jmp(&assure_seq_string); + + // SlicedString, unpack and add offset. + __ bind(&sliced_string); + __ ldr(result_, FieldMemOperand(object_, SlicedString::kOffsetOffset)); + __ add(scratch_, scratch_, result_); + __ ldr(object_, FieldMemOperand(object_, SlicedString::kParentOffset)); + + // Assure that we are dealing with a sequential string. Go to runtime if not. + __ bind(&assure_seq_string); __ ldr(result_, FieldMemOperand(object_, HeapObject::kMapOffset)); __ ldrb(result_, FieldMemOperand(result_, Map::kInstanceTypeOffset)); - // If the first cons component is also non-flat, then go to runtime. + // Check that parent is not an external string. Go to runtime otherwise. STATIC_ASSERT(kSeqStringTag == 0); __ tst(result_, Operand(kStringRepresentationMask)); __ b(ne, &call_runtime_); @@ -5428,10 +5467,17 @@ void SubStringStub::Generate(MacroAssembler* masm) { // Check bounds and smi-ness. Register to = r6; Register from = r7; + + if (FLAG_string_slices) { + __ nop(0); // Jumping as first instruction would crash the code generation. 
+ __ jmp(&runtime); + } + __ Ldrd(to, from, MemOperand(sp, kToOffset)); STATIC_ASSERT(kFromOffset == kToOffset + 4); STATIC_ASSERT(kSmiTag == 0); STATIC_ASSERT(kSmiTagSize + kSmiShiftSize == 1); + // I.e., arithmetic shift right by one un-smi-tags. __ mov(r2, Operand(to, ASR, 1), SetCC); __ mov(r3, Operand(from, ASR, 1), SetCC, cc); @@ -5440,7 +5486,6 @@ void SubStringStub::Generate(MacroAssembler* masm) { __ b(mi, &runtime); // From is negative. // Both to and from are smis. - __ sub(r2, r2, Operand(r3), SetCC); __ b(mi, &runtime); // Fail if from > to. // Special handling of sub-strings of length 1 and 2. One character strings diff --git a/src/arm/lithium-arm.cc b/src/arm/lithium-arm.cc index 38f77cd..30d7a1c 100644 --- a/src/arm/lithium-arm.cc +++ b/src/arm/lithium-arm.cc @@ -1999,8 +1999,8 @@ LInstruction* LChunkBuilder::DoStringAdd(HStringAdd* instr) { LInstruction* LChunkBuilder::DoStringCharCodeAt(HStringCharCodeAt* instr) { - LOperand* string = UseRegister(instr->string()); - LOperand* index = UseRegisterOrConstant(instr->index()); + LOperand* string = UseTempRegister(instr->string()); + LOperand* index = UseTempRegister(instr->index()); LStringCharCodeAt* result = new LStringCharCodeAt(string, index); return AssignEnvironment(AssignPointerMap(DefineAsRegister(result))); } diff --git a/src/arm/lithium-codegen-arm.cc b/src/arm/lithium-codegen-arm.cc index 3469bb6..65a6169 100644 --- a/src/arm/lithium-codegen-arm.cc +++ b/src/arm/lithium-codegen-arm.cc @@ -3455,97 +3455,83 @@ void LCodeGen::DoStringCharCodeAt(LStringCharCodeAt* instr) { LStringCharCodeAt* instr_; }; - Register scratch = scratch0(); Register string = ToRegister(instr->string()); - Register index = no_reg; - int const_index = -1; - if (instr->index()->IsConstantOperand()) { - const_index = ToInteger32(LConstantOperand::cast(instr->index())); - STATIC_ASSERT(String::kMaxLength <= Smi::kMaxValue); - if (!Smi::IsValid(const_index)) { - // Guaranteed to be out of bounds because of the assert above. 
- // So the bounds check that must dominate this instruction must - // have deoptimized already. - if (FLAG_debug_code) { - __ Abort("StringCharCodeAt: out of bounds index."); - } - // No code needs to be generated. - return; - } - } else { - index = ToRegister(instr->index()); - } + Register index = ToRegister(instr->index()); Register result = ToRegister(instr->result()); DeferredStringCharCodeAt* deferred = new DeferredStringCharCodeAt(this, instr); - Label flat_string, ascii_string, done; - // Fetch the instance type of the receiver into result register. __ ldr(result, FieldMemOperand(string, HeapObject::kMapOffset)); __ ldrb(result, FieldMemOperand(result, Map::kInstanceTypeOffset)); - // We need special handling for non-flat strings. - STATIC_ASSERT(kSeqStringTag == 0); - __ tst(result, Operand(kStringRepresentationMask)); - __ b(eq, &flat_string); - - // Handle non-flat strings. - __ tst(result, Operand(kIsConsStringMask)); - __ b(eq, deferred->entry()); - - // ConsString. + // We need special handling for indirect strings. + Label check_sequential; + __ tst(result, Operand(kIsIndirectStringMask)); + __ b(eq, &check_sequential); + + // Dispatch on the indirect string shape: slice or cons. + Label cons_string; + const uint32_t kSlicedNotConsMask = kSlicedStringTag & ~kConsStringTag; + ASSERT(IsPowerOf2(kSlicedNotConsMask) && kSlicedNotConsMask != 0); + __ tst(result, Operand(kSlicedNotConsMask)); + __ b(eq, &cons_string); + + // Handle slices. + Label indirect_string_loaded; + __ ldr(result, FieldMemOperand(string, SlicedString::kOffsetOffset)); + __ add(index, index, Operand(result, ASR, kSmiTagSize)); + __ ldr(string, FieldMemOperand(string, SlicedString::kParentOffset)); + __ jmp(&indirect_string_loaded); + + // Handle conses. // Check whether the right hand side is the empty string (i.e. if // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. 
- __ ldr(scratch, FieldMemOperand(string, ConsString::kSecondOffset)); + __ bind(&cons_string); + __ ldr(result, FieldMemOperand(string, ConsString::kSecondOffset)); __ LoadRoot(ip, Heap::kEmptyStringRootIndex); - __ cmp(scratch, ip); + __ cmp(result, ip); __ b(ne, deferred->entry()); // Get the first of the two strings and load its instance type. __ ldr(string, FieldMemOperand(string, ConsString::kFirstOffset)); + + __ bind(&indirect_string_loaded); __ ldr(result, FieldMemOperand(string, HeapObject::kMapOffset)); __ ldrb(result, FieldMemOperand(result, Map::kInstanceTypeOffset)); - // If the first cons component is also non-flat, then go to runtime. + + // Check whether the string is sequential. The only non-sequential + // shapes we support have just been unwrapped above. + __ bind(&check_sequential); STATIC_ASSERT(kSeqStringTag == 0); __ tst(result, Operand(kStringRepresentationMask)); __ b(ne, deferred->entry()); - // Check for 1-byte or 2-byte string. - __ bind(&flat_string); + // Dispatch on the encoding: ASCII or two-byte. + Label ascii_string; STATIC_ASSERT(kAsciiStringTag != 0); __ tst(result, Operand(kStringEncodingMask)); __ b(ne, &ascii_string); - // 2-byte string. - // Load the 2-byte character code into the result register. - STATIC_ASSERT(kSmiTag == 0 && kSmiTagSize == 1); - if (instr->index()->IsConstantOperand()) { - __ ldrh(result, - FieldMemOperand(string, - SeqTwoByteString::kHeaderSize + 2 * const_index)); - } else { - __ add(scratch, - string, - Operand(SeqTwoByteString::kHeaderSize - kHeapObjectTag)); - __ ldrh(result, MemOperand(scratch, index, LSL, 1)); - } + // Two-byte string. + // Load the two-byte character code into the result register. + Label done; + __ add(result, + string, + Operand(SeqTwoByteString::kHeaderSize - kHeapObjectTag)); + __ ldrh(result, MemOperand(result, index, LSL, 1)); __ jmp(&done); // ASCII string. // Load the byte into the result register. 
__ bind(&ascii_string); - if (instr->index()->IsConstantOperand()) { - __ ldrb(result, FieldMemOperand(string, - SeqAsciiString::kHeaderSize + const_index)); - } else { - __ add(scratch, - string, - Operand(SeqAsciiString::kHeaderSize - kHeapObjectTag)); - __ ldrb(result, MemOperand(scratch, index)); - } + __ add(result, + string, + Operand(SeqAsciiString::kHeaderSize - kHeapObjectTag)); + __ ldrb(result, MemOperand(result, index)); + __ bind(&done); __ bind(deferred->exit()); } diff --git a/src/arm/regexp-macro-assembler-arm.cc b/src/arm/regexp-macro-assembler-arm.cc index 983a528..81645c7 100644 --- a/src/arm/regexp-macro-assembler-arm.cc +++ b/src/arm/regexp-macro-assembler-arm.cc @@ -1034,12 +1034,13 @@ int RegExpMacroAssemblerARM::CheckStackGuardState(Address* return_address, } // Prepare for possible GC. - HandleScope handles; + HandleScope handles(isolate); Handle code_handle(re_code); Handle subject(frame_entry(re_frame, kInputString)); + // Current string. - bool is_ascii = subject->IsAsciiRepresentation(); + bool is_ascii = subject->IsAsciiRepresentationUnderneath(); ASSERT(re_code->instruction_start() <= *return_address); ASSERT(*return_address <= @@ -1057,8 +1058,20 @@ int RegExpMacroAssemblerARM::CheckStackGuardState(Address* return_address, return EXCEPTION; } + Handle subject_tmp = subject; + int slice_offset = 0; + + // Extract the underlying string and the slice offset. + if (StringShape(*subject_tmp).IsCons()) { + subject_tmp = Handle(ConsString::cast(*subject_tmp)->first()); + } else if (StringShape(*subject_tmp).IsSliced()) { + SlicedString* slice = SlicedString::cast(*subject_tmp); + subject_tmp = Handle(slice->parent()); + slice_offset = slice->offset(); + } + // String might have changed. 
- if (subject->IsAsciiRepresentation() != is_ascii) { + if (subject_tmp->IsAsciiRepresentation() != is_ascii) { // If we changed between an ASCII and an UC16 string, the specialized // code cannot be used, and we need to restart regexp matching from // scratch (including, potentially, compiling a new version of the code). @@ -1069,8 +1082,8 @@ int RegExpMacroAssemblerARM::CheckStackGuardState(Address* return_address, // be a sequential or external string with the same content. // Update the start and end pointers in the stack frame to the current // location (whether it has actually moved or not). - ASSERT(StringShape(*subject).IsSequential() || - StringShape(*subject).IsExternal()); + ASSERT(StringShape(*subject_tmp).IsSequential() || + StringShape(*subject_tmp).IsExternal()); // The original start address of the characters to match. const byte* start_address = frame_entry(re_frame, kInputStart); @@ -1078,13 +1091,14 @@ int RegExpMacroAssemblerARM::CheckStackGuardState(Address* return_address, // Find the current start address of the same character at the current string // position. int start_index = frame_entry(re_frame, kStartIndex); - const byte* new_address = StringCharacterPosition(*subject, start_index); + const byte* new_address = StringCharacterPosition(*subject_tmp, + start_index + slice_offset); if (start_address != new_address) { // If there is a difference, update the object pointer and start and end // addresses in the RegExp stack frame to match the new value. 
const byte* end_address = frame_entry(re_frame, kInputEnd); - int byte_length = end_address - start_address; + int byte_length = static_cast(end_address - start_address); frame_entry(re_frame, kInputString) = *subject; frame_entry(re_frame, kInputStart) = new_address; frame_entry(re_frame, kInputEnd) = new_address + byte_length; diff --git a/src/flag-definitions.h b/src/flag-definitions.h index 2d8f6fa..7df2b0b 100644 --- a/src/flag-definitions.h +++ b/src/flag-definitions.h @@ -104,6 +104,7 @@ DEFINE_bool(harmony_block_scoping, false, "enable harmony block scoping") // Flags for experimental implementation features. DEFINE_bool(unbox_double_arrays, true, "automatically unbox arrays of doubles") +DEFINE_bool(string_slices, false, "use string slices") // Flags for Crankshaft. #ifdef V8_TARGET_ARCH_MIPS diff --git a/src/heap-inl.h b/src/heap-inl.h index b08655c..7b666af 100644 --- a/src/heap-inl.h +++ b/src/heap-inl.h @@ -323,10 +323,10 @@ AllocationSpace Heap::TargetSpaceId(InstanceType type) { ASSERT(type != JS_GLOBAL_PROPERTY_CELL_TYPE); if (type < FIRST_NONSTRING_TYPE) { - // There are three string representations: sequential strings, cons - // strings, and external strings. Only cons strings contain - // non-map-word pointers to heap objects. - return ((type & kStringRepresentationMask) == kConsStringTag) + // There are four string representations: sequential strings, external + // strings, cons strings, and sliced strings. + // Only the latter two contain non-map-word pointers to heap objects. + return ((type & kIsIndirectStringMask) == kIsIndirectStringTag) ? 
OLD_POINTER_SPACE : OLD_DATA_SPACE; } else { diff --git a/src/heap.cc b/src/heap.cc index e080cde..90d0e11 100644 --- a/src/heap.cc +++ b/src/heap.cc @@ -1290,6 +1290,10 @@ class ScavengingVisitor : public StaticVisitorBase { &ObjectEvacuationStrategy:: template VisitSpecialized); + table_.Register(kVisitSlicedString, + &ObjectEvacuationStrategy:: + template VisitSpecialized); + table_.Register(kVisitSharedFunctionInfo, &ObjectEvacuationStrategy:: template VisitSpecialized); @@ -2564,6 +2568,8 @@ MaybeObject* Heap::AllocateConsString(String* first, String* second) { // If the resulting string is small make a flat string. if (length < String::kMinNonFlatLength) { + // Note that neither of the two inputs can be a slice because: + STATIC_ASSERT(String::kMinNonFlatLength <= SlicedString::kMinLength); ASSERT(first->IsFlat()); ASSERT(second->IsFlat()); if (is_ascii) { @@ -2655,24 +2661,69 @@ MaybeObject* Heap::AllocateSubString(String* buffer, // Make an attempt to flatten the buffer to reduce access time. buffer = buffer->TryFlattenGetString(); + // TODO(1626): For now slicing external strings is not supported. However, + // a flat cons string can have an external string as first part in some cases. + // Therefore we have to single out this case as well. + if (!FLAG_string_slices || + (buffer->IsConsString() && + (!buffer->IsFlat() || + !ConsString::cast(buffer)->first()->IsSeqString())) || + buffer->IsExternalString() || + length < SlicedString::kMinLength || + pretenure == TENURED) { + Object* result; + { MaybeObject* maybe_result = buffer->IsAsciiRepresentation() + ? AllocateRawAsciiString(length, pretenure) + : AllocateRawTwoByteString(length, pretenure); + if (!maybe_result->ToObject(&result)) return maybe_result; + } + String* string_result = String::cast(result); + // Copy the characters into the new object. 
+ if (buffer->IsAsciiRepresentation()) { + ASSERT(string_result->IsAsciiRepresentation()); + char* dest = SeqAsciiString::cast(string_result)->GetChars(); + String::WriteToFlat(buffer, dest, start, end); + } else { + ASSERT(string_result->IsTwoByteRepresentation()); + uc16* dest = SeqTwoByteString::cast(string_result)->GetChars(); + String::WriteToFlat(buffer, dest, start, end); + } + return result; + } + + ASSERT(buffer->IsFlat()); + ASSERT(!buffer->IsExternalString()); +#if DEBUG + buffer->StringVerify(); +#endif + Object* result; - { MaybeObject* maybe_result = buffer->IsAsciiRepresentation() - ? AllocateRawAsciiString(length, pretenure ) - : AllocateRawTwoByteString(length, pretenure); + { Map* map = buffer->IsAsciiRepresentation() + ? sliced_ascii_string_map() + : sliced_string_map(); + MaybeObject* maybe_result = Allocate(map, NEW_SPACE); if (!maybe_result->ToObject(&result)) return maybe_result; } - String* string_result = String::cast(result); - // Copy the characters into the new object. - if (buffer->IsAsciiRepresentation()) { - ASSERT(string_result->IsAsciiRepresentation()); - char* dest = SeqAsciiString::cast(string_result)->GetChars(); - String::WriteToFlat(buffer, dest, start, end); + + AssertNoAllocation no_gc; + SlicedString* sliced_string = SlicedString::cast(result); + sliced_string->set_length(length); + sliced_string->set_hash_field(String::kEmptyHashField); + if (buffer->IsConsString()) { + ConsString* cons = ConsString::cast(buffer); + ASSERT(cons->second()->length() == 0); + sliced_string->set_parent(cons->first()); + sliced_string->set_offset(start); + } else if (buffer->IsSlicedString()) { + // Prevent nesting sliced strings. 
+ SlicedString* parent_slice = SlicedString::cast(buffer); + sliced_string->set_parent(parent_slice->parent()); + sliced_string->set_offset(start + parent_slice->offset()); } else { - ASSERT(string_result->IsTwoByteRepresentation()); - uc16* dest = SeqTwoByteString::cast(string_result)->GetChars(); - String::WriteToFlat(buffer, dest, start, end); + sliced_string->set_parent(buffer); + sliced_string->set_offset(start); } - + ASSERT(sliced_string->parent()->IsSeqString()); return result; } diff --git a/src/heap.h b/src/heap.h index c4ee4db..0f69fab 100644 --- a/src/heap.h +++ b/src/heap.h @@ -88,6 +88,8 @@ inline Heap* _inline_get_heap_(); V(Map, symbol_map, SymbolMap) \ V(Map, cons_string_map, ConsStringMap) \ V(Map, cons_ascii_string_map, ConsAsciiStringMap) \ + V(Map, sliced_string_map, SlicedStringMap) \ + V(Map, sliced_ascii_string_map, SlicedAsciiStringMap) \ V(Map, ascii_symbol_map, AsciiSymbolMap) \ V(Map, cons_symbol_map, ConsSymbolMap) \ V(Map, cons_ascii_symbol_map, ConsAsciiSymbolMap) \ diff --git a/src/ia32/code-stubs-ia32.cc b/src/ia32/code-stubs-ia32.cc index 77e8432..d76e4bf 100644 --- a/src/ia32/code-stubs-ia32.cc +++ b/src/ia32/code-stubs-ia32.cc @@ -3371,6 +3371,8 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ cmp(edx, Operand(eax)); __ j(greater, &runtime); + // Reset offset for possibly sliced string. + __ Set(edi, Immediate(0)); // ecx: RegExp data (FixedArray) // Check the representation and encoding of the subject string. Label seq_ascii_string, seq_two_byte_string, check_code; @@ -3381,36 +3383,45 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ and_(ebx, kIsNotStringMask | kStringRepresentationMask | kStringEncodingMask); STATIC_ASSERT((kStringTag | kSeqStringTag | kTwoByteStringTag) == 0); - __ j(zero, &seq_two_byte_string); + __ j(zero, &seq_two_byte_string, Label::kNear); // Any other flat string must be a flat ascii string. 
- __ test(Operand(ebx), + __ and_(Operand(ebx), Immediate(kIsNotStringMask | kStringRepresentationMask)); - __ j(zero, &seq_ascii_string); + __ j(zero, &seq_ascii_string, Label::kNear); - // Check for flat cons string. + // Check for flat cons string or sliced string. // A flat cons string is a cons string where the second part is the empty // string. In that case the subject string is just the first part of the cons // string. Also in this case the first part of the cons string is known to be // a sequential string or an external string. - STATIC_ASSERT(kExternalStringTag != 0); - STATIC_ASSERT((kConsStringTag & kExternalStringTag) == 0); - __ test(Operand(ebx), - Immediate(kIsNotStringMask | kExternalStringTag)); - __ j(not_zero, &runtime); - // String is a cons string. - __ mov(edx, FieldOperand(eax, ConsString::kSecondOffset)); - __ cmp(Operand(edx), factory->empty_string()); + // In the case of a sliced string its offset has to be taken into account. + Label cons_string, check_encoding; + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmp(Operand(ebx), Immediate(kExternalStringTag)); + __ j(less, &cons_string); + __ j(equal, &runtime); + + // String is sliced. + __ mov(edi, FieldOperand(eax, SlicedString::kOffsetOffset)); + __ mov(eax, FieldOperand(eax, SlicedString::kParentOffset)); + // edi: offset of sliced string, smi-tagged. + // eax: parent string. + __ jmp(&check_encoding, Label::kNear); + // String is a cons string, check whether it is flat. + __ bind(&cons_string); + __ cmp(FieldOperand(eax, ConsString::kSecondOffset), factory->empty_string()); __ j(not_equal, &runtime); __ mov(eax, FieldOperand(eax, ConsString::kFirstOffset)); + __ bind(&check_encoding); __ mov(ebx, FieldOperand(eax, HeapObject::kMapOffset)); - // String is a cons string with empty second part. - // eax: first part of cons string. - // ebx: map of first part of cons string. 
- // Is first part a flat two byte string? + // eax: first part of cons string or parent of sliced string. + // ebx: map of first part of cons string or map of parent of sliced string. + // Is first part of cons or parent of slice a flat two byte string? __ test_b(FieldOperand(ebx, Map::kInstanceTypeOffset), kStringRepresentationMask | kStringEncodingMask); STATIC_ASSERT((kSeqStringTag | kTwoByteStringTag) == 0); - __ j(zero, &seq_two_byte_string); + __ j(zero, &seq_two_byte_string, Label::kNear); // Any other flat string must be ascii. __ test_b(FieldOperand(ebx, Map::kInstanceTypeOffset), kStringRepresentationMask); @@ -3420,14 +3431,14 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // eax: subject string (flat ascii) // ecx: RegExp data (FixedArray) __ mov(edx, FieldOperand(ecx, JSRegExp::kDataAsciiCodeOffset)); - __ Set(edi, Immediate(1)); // Type is ascii. - __ jmp(&check_code); + __ Set(ecx, Immediate(1)); // Type is ascii. + __ jmp(&check_code, Label::kNear); __ bind(&seq_two_byte_string); // eax: subject string (flat two byte) // ecx: RegExp data (FixedArray) __ mov(edx, FieldOperand(ecx, JSRegExp::kDataUC16CodeOffset)); - __ Set(edi, Immediate(0)); // Type is two byte. + __ Set(ecx, Immediate(0)); // Type is two byte. __ bind(&check_code); // Check that the irregexp code has been generated for the actual string @@ -3437,7 +3448,7 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // eax: subject string // edx: code - // edi: encoding of subject string (1 if ascii, 0 if two_byte); + // ecx: encoding of subject string (1 if ascii, 0 if two_byte); // Load used arguments before starting to push arguments for call to native // RegExp code to avoid handling changing stack height. 
__ mov(ebx, Operand(esp, kPreviousIndexOffset)); @@ -3446,7 +3457,7 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // eax: subject string // ebx: previous index // edx: code - // edi: encoding of subject string (1 if ascii 0 if two_byte); + // ecx: encoding of subject string (1 if ascii 0 if two_byte); // All checks done. Now push arguments for native regexp code. Counters* counters = masm->isolate()->counters(); __ IncrementCounter(counters->regexp_entry_native(), 1); @@ -3463,23 +3474,47 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ mov(Operand(esp, 6 * kPointerSize), Immediate(1)); // Argument 6: Start (high end) of backtracking stack memory area. - __ mov(ecx, Operand::StaticVariable(address_of_regexp_stack_memory_address)); - __ add(ecx, Operand::StaticVariable(address_of_regexp_stack_memory_size)); - __ mov(Operand(esp, 5 * kPointerSize), ecx); + __ mov(esi, Operand::StaticVariable(address_of_regexp_stack_memory_address)); + __ add(esi, Operand::StaticVariable(address_of_regexp_stack_memory_size)); + __ mov(Operand(esp, 5 * kPointerSize), esi); // Argument 5: static offsets vector buffer. __ mov(Operand(esp, 4 * kPointerSize), Immediate(ExternalReference::address_of_static_offsets_vector( masm->isolate()))); + // Argument 2: Previous index. + __ mov(Operand(esp, 1 * kPointerSize), ebx); + + // Argument 1: Original subject string. + // The original subject is in the previous stack frame. Therefore we have to + // use ebp, which points exactly to one pointer size below the previous esp. + // (Because creating a new stack frame pushes the previous ebp onto the stack + // and thereby moves up esp by one kPointerSize.) 
+ __ mov(esi, Operand(ebp, kSubjectOffset + kPointerSize)); + __ mov(Operand(esp, 0 * kPointerSize), esi); + + // esi: original subject string + // eax: underlying subject string + // ebx: previous index + // ecx: encoding of subject string (1 if ascii 0 if two_byte); + // edx: code // Argument 4: End of string data // Argument 3: Start of string data + // Prepare start and end index of the input. + // Load the length from the original sliced string if that is the case. + __ mov(esi, FieldOperand(esi, String::kLengthOffset)); + __ add(esi, Operand(edi)); // Calculate input end wrt offset. + __ SmiUntag(edi); + __ add(ebx, Operand(edi)); // Calculate input start wrt offset. + + // ebx: start index of the input string + // esi: end index of the input string Label setup_two_byte, setup_rest; - __ test(edi, Operand(edi)); - __ mov(edi, FieldOperand(eax, String::kLengthOffset)); + __ test(ecx, Operand(ecx)); __ j(zero, &setup_two_byte, Label::kNear); - __ SmiUntag(edi); - __ lea(ecx, FieldOperand(eax, edi, times_1, SeqAsciiString::kHeaderSize)); + __ SmiUntag(esi); + __ lea(ecx, FieldOperand(eax, esi, times_1, SeqAsciiString::kHeaderSize)); __ mov(Operand(esp, 3 * kPointerSize), ecx); // Argument 4. __ lea(ecx, FieldOperand(eax, ebx, times_1, SeqAsciiString::kHeaderSize)); __ mov(Operand(esp, 2 * kPointerSize), ecx); // Argument 3. @@ -3487,20 +3522,14 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ bind(&setup_two_byte); STATIC_ASSERT(kSmiTag == 0); - STATIC_ASSERT(kSmiTagSize == 1); // edi is smi (powered by 2). - __ lea(ecx, FieldOperand(eax, edi, times_1, SeqTwoByteString::kHeaderSize)); + STATIC_ASSERT(kSmiTagSize == 1); // esi is smi (powered by 2). + __ lea(ecx, FieldOperand(eax, esi, times_1, SeqTwoByteString::kHeaderSize)); __ mov(Operand(esp, 3 * kPointerSize), ecx); // Argument 4. __ lea(ecx, FieldOperand(eax, ebx, times_2, SeqTwoByteString::kHeaderSize)); __ mov(Operand(esp, 2 * kPointerSize), ecx); // Argument 3. 
__ bind(&setup_rest); - // Argument 2: Previous index. - __ mov(Operand(esp, 1 * kPointerSize), ebx); - - // Argument 1: Subject string. - __ mov(Operand(esp, 0 * kPointerSize), eax); - // Locate the code entry and call it. __ add(Operand(edx), Immediate(Code::kHeaderSize - kHeapObjectTag)); __ call(Operand(edx)); @@ -3539,7 +3568,7 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // by javascript code. __ cmp(eax, factory->termination_exception()); Label throw_termination_exception; - __ j(equal, &throw_termination_exception); + __ j(equal, &throw_termination_exception, Label::kNear); // Handle normal exception by following handler chain. __ Throw(eax); @@ -4811,6 +4840,7 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { Label flat_string; Label ascii_string; Label got_char_code; + Label sliced_string; // If the receiver is a smi trigger the non-string case. STATIC_ASSERT(kSmiTag == 0); @@ -4841,31 +4871,45 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { __ j(zero, &flat_string); // Handle non-flat strings. - __ test(result_, Immediate(kIsConsStringMask)); - __ j(zero, &call_runtime_); + __ and_(result_, kStringRepresentationMask); + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmp(result_, kExternalStringTag); + __ j(greater, &sliced_string, Label::kNear); + __ j(equal, &call_runtime_); // ConsString. // Check whether the right hand side is the empty string (i.e. if // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. + Label assure_seq_string; __ cmp(FieldOperand(object_, ConsString::kSecondOffset), Immediate(masm->isolate()->factory()->empty_string())); __ j(not_equal, &call_runtime_); // Get the first of the two strings and load its instance type. 
__ mov(object_, FieldOperand(object_, ConsString::kFirstOffset)); + __ jmp(&assure_seq_string, Label::kNear); + + // SlicedString, unpack and add offset. + __ bind(&sliced_string); + __ add(scratch_, FieldOperand(object_, SlicedString::kOffsetOffset)); + __ mov(object_, FieldOperand(object_, SlicedString::kParentOffset)); + + // Assure that we are dealing with a sequential string. Go to runtime if not. + __ bind(&assure_seq_string); __ mov(result_, FieldOperand(object_, HeapObject::kMapOffset)); __ movzx_b(result_, FieldOperand(result_, Map::kInstanceTypeOffset)); - // If the first cons component is also non-flat, then go to runtime. STATIC_ASSERT(kSeqStringTag == 0); __ test(result_, Immediate(kStringRepresentationMask)); __ j(not_zero, &call_runtime_); + __ jmp(&flat_string, Label::kNear); // Check for 1-byte or 2-byte string. __ bind(&flat_string); STATIC_ASSERT(kAsciiStringTag != 0); __ test(result_, Immediate(kStringEncodingMask)); - __ j(not_zero, &ascii_string); + __ j(not_zero, &ascii_string, Label::kNear); // 2-byte string. // Load the 2-byte character code into the result register. @@ -4873,7 +4917,7 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { __ movzx_w(result_, FieldOperand(object_, scratch_, times_1, // Scratch is smi-tagged. SeqTwoByteString::kHeaderSize)); - __ jmp(&got_char_code); + __ jmp(&got_char_code, Label::kNear); // ASCII string. // Load the byte into the result register. @@ -5185,6 +5229,8 @@ void StringAddStub::Generate(MacroAssembler* masm) { __ and_(ecx, kStringRepresentationMask); __ cmp(ecx, kExternalStringTag); __ j(equal, &string_add_runtime); + // We cannot encounter sliced strings here since: + STATIC_ASSERT(SlicedString::kMinLength >= String::kMinNonFlatLength); // Now check if both strings are ascii strings. 
// eax: first string // ebx: length of resulting flat string as a smi @@ -5596,6 +5642,9 @@ void StringHelper::GenerateHashGetHash(MacroAssembler* masm, void SubStringStub::Generate(MacroAssembler* masm) { Label runtime; + if (FLAG_string_slices) { + __ jmp(&runtime); + } // Stack frame on entry. // esp[0]: return address // esp[4]: to diff --git a/src/ia32/lithium-codegen-ia32.cc b/src/ia32/lithium-codegen-ia32.cc index 71fe8d9..5f67038 100644 --- a/src/ia32/lithium-codegen-ia32.cc +++ b/src/ia32/lithium-codegen-ia32.cc @@ -3217,95 +3217,81 @@ void LCodeGen::DoStringCharCodeAt(LStringCharCodeAt* instr) { }; Register string = ToRegister(instr->string()); - Register index = no_reg; - int const_index = -1; - if (instr->index()->IsConstantOperand()) { - const_index = ToInteger32(LConstantOperand::cast(instr->index())); - STATIC_ASSERT(String::kMaxLength <= Smi::kMaxValue); - if (!Smi::IsValid(const_index)) { - // Guaranteed to be out of bounds because of the assert above. - // So the bounds check that must dominate this instruction must - // have deoptimized already. - if (FLAG_debug_code) { - __ Abort("StringCharCodeAt: out of bounds index."); - } - // No code needs to be generated. - return; - } - } else { - index = ToRegister(instr->index()); - } + Register index = ToRegister(instr->index()); Register result = ToRegister(instr->result()); DeferredStringCharCodeAt* deferred = new DeferredStringCharCodeAt(this, instr); - Label flat_string, ascii_string, done; - // Fetch the instance type of the receiver into result register. __ mov(result, FieldOperand(string, HeapObject::kMapOffset)); __ movzx_b(result, FieldOperand(result, Map::kInstanceTypeOffset)); - // We need special handling for non-flat strings. - STATIC_ASSERT(kSeqStringTag == 0); - __ test(result, Immediate(kStringRepresentationMask)); - __ j(zero, &flat_string, Label::kNear); - - // Handle non-flat strings. 
- __ test(result, Immediate(kIsConsStringMask)); - __ j(zero, deferred->entry()); + // We need special handling for indirect strings. + Label check_sequential; + __ test(result, Immediate(kIsIndirectStringMask)); + __ j(zero, &check_sequential, Label::kNear); + + // Dispatch on the indirect string shape: slice or cons. + Label cons_string; + const uint32_t kSlicedNotConsMask = kSlicedStringTag & ~kConsStringTag; + ASSERT(IsPowerOf2(kSlicedNotConsMask) && kSlicedNotConsMask != 0); + __ test(result, Immediate(kSlicedNotConsMask)); + __ j(zero, &cons_string, Label::kNear); + + // Handle slices. + Label indirect_string_loaded; + __ mov(result, FieldOperand(string, SlicedString::kOffsetOffset)); + __ SmiUntag(result); + __ add(index, Operand(result)); + __ mov(string, FieldOperand(string, SlicedString::kParentOffset)); + __ jmp(&indirect_string_loaded, Label::kNear); - // ConsString. + // Handle conses. // Check whether the right hand side is the empty string (i.e. if // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. + __ bind(&cons_string); __ cmp(FieldOperand(string, ConsString::kSecondOffset), Immediate(factory()->empty_string())); __ j(not_equal, deferred->entry()); - // Get the first of the two strings and load its instance type. __ mov(string, FieldOperand(string, ConsString::kFirstOffset)); + + __ bind(&indirect_string_loaded); __ mov(result, FieldOperand(string, HeapObject::kMapOffset)); __ movzx_b(result, FieldOperand(result, Map::kInstanceTypeOffset)); - // If the first cons component is also non-flat, then go to runtime. + + // Check whether the string is sequential. The only non-sequential + // shapes we support have just been unwrapped above. + __ bind(&check_sequential); STATIC_ASSERT(kSeqStringTag == 0); __ test(result, Immediate(kStringRepresentationMask)); __ j(not_zero, deferred->entry()); - // Check for ASCII or two-byte string. 
- __ bind(&flat_string); + // Dispatch on the encoding: ASCII or two-byte. + Label ascii_string; STATIC_ASSERT(kAsciiStringTag != 0); __ test(result, Immediate(kStringEncodingMask)); __ j(not_zero, &ascii_string, Label::kNear); // Two-byte string. // Load the two-byte character code into the result register. + Label done; STATIC_ASSERT(kSmiTag == 0 && kSmiTagSize == 1); - if (instr->index()->IsConstantOperand()) { - __ movzx_w(result, - FieldOperand(string, - SeqTwoByteString::kHeaderSize + - (kUC16Size * const_index))); - } else { - __ movzx_w(result, FieldOperand(string, - index, - times_2, - SeqTwoByteString::kHeaderSize)); - } + __ movzx_w(result, FieldOperand(string, + index, + times_2, + SeqTwoByteString::kHeaderSize)); __ jmp(&done, Label::kNear); // ASCII string. // Load the byte into the result register. __ bind(&ascii_string); - if (instr->index()->IsConstantOperand()) { - __ movzx_b(result, FieldOperand(string, - SeqAsciiString::kHeaderSize + const_index)); - } else { - __ movzx_b(result, FieldOperand(string, - index, - times_1, - SeqAsciiString::kHeaderSize)); - } + __ movzx_b(result, FieldOperand(string, + index, + times_1, + SeqAsciiString::kHeaderSize)); __ bind(&done); __ bind(deferred->exit()); } diff --git a/src/ia32/lithium-ia32.cc b/src/ia32/lithium-ia32.cc index bb92e89..34c5beb 100644 --- a/src/ia32/lithium-ia32.cc +++ b/src/ia32/lithium-ia32.cc @@ -2058,8 +2058,8 @@ LInstruction* LChunkBuilder::DoStringAdd(HStringAdd* instr) { LInstruction* LChunkBuilder::DoStringCharCodeAt(HStringCharCodeAt* instr) { - LOperand* string = UseRegister(instr->string()); - LOperand* index = UseRegisterOrConstant(instr->index()); + LOperand* string = UseTempRegister(instr->string()); + LOperand* index = UseTempRegister(instr->index()); LOperand* context = UseAny(instr->context()); LStringCharCodeAt* result = new LStringCharCodeAt(context, string, index); return AssignEnvironment(AssignPointerMap(DefineAsRegister(result))); diff --git 
a/src/ia32/regexp-macro-assembler-ia32.cc b/src/ia32/regexp-macro-assembler-ia32.cc index 8db2e9b..7d7de0e 100644 --- a/src/ia32/regexp-macro-assembler-ia32.cc +++ b/src/ia32/regexp-macro-assembler-ia32.cc @@ -1065,12 +1065,13 @@ int RegExpMacroAssemblerIA32::CheckStackGuardState(Address* return_address, } // Prepare for possible GC. - HandleScope handles; + HandleScope handles(isolate); Handle<Code> code_handle(re_code); Handle<String> subject(frame_entry<String*>(re_frame, kInputString)); + // Current string. - bool is_ascii = subject->IsAsciiRepresentation(); + bool is_ascii = subject->IsAsciiRepresentationUnderneath(); ASSERT(re_code->instruction_start() <= *return_address); ASSERT(*return_address <= @@ -1088,8 +1089,20 @@ int RegExpMacroAssemblerIA32::CheckStackGuardState(Address* return_address, return EXCEPTION; } + Handle<String> subject_tmp = subject; + int slice_offset = 0; + + // Extract the underlying string and the slice offset. + if (StringShape(*subject_tmp).IsCons()) { + subject_tmp = Handle<String>(ConsString::cast(*subject_tmp)->first()); + } else if (StringShape(*subject_tmp).IsSliced()) { + SlicedString* slice = SlicedString::cast(*subject_tmp); + subject_tmp = Handle<String>(slice->parent()); + slice_offset = slice->offset(); + } + // String might have changed. - if (subject->IsAsciiRepresentation() != is_ascii) { + if (subject_tmp->IsAsciiRepresentation() != is_ascii) { // If we changed between an ASCII and an UC16 string, the specialized // code cannot be used, and we need to restart regexp matching from // scratch (including, potentially, compiling a new version of the code). @@ -1100,8 +1113,8 @@ int RegExpMacroAssemblerIA32::CheckStackGuardState(Address* return_address, // be a sequential or external string with the same content. // Update the start and end pointers in the stack frame to the current // location (whether it has actually moved or not). 
- ASSERT(StringShape(*subject).IsSequential() || - StringShape(*subject).IsExternal()); + ASSERT(StringShape(*subject_tmp).IsSequential() || + StringShape(*subject_tmp).IsExternal()); // The original start address of the characters to match. const byte* start_address = frame_entry<const byte*>(re_frame, kInputStart); @@ -1109,13 +1122,14 @@ int RegExpMacroAssemblerIA32::CheckStackGuardState(Address* return_address, // Find the current start address of the same character at the current string // position. int start_index = frame_entry<int>(re_frame, kStartIndex); - const byte* new_address = StringCharacterPosition(*subject, start_index); + const byte* new_address = StringCharacterPosition(*subject_tmp, + start_index + slice_offset); if (start_address != new_address) { // If there is a difference, update the object pointer and start and end // addresses in the RegExp stack frame to match the new value. const byte* end_address = frame_entry<const byte*>(re_frame, kInputEnd); - int byte_length = end_address - start_address; + int byte_length = static_cast<int>(end_address - start_address); frame_entry<const String*>(re_frame, kInputString) = *subject; frame_entry<const byte*>(re_frame, kInputStart) = new_address; frame_entry<const byte*>(re_frame, kInputEnd) = new_address + byte_length; diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 62d93a7..192fbf0 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -224,9 +224,9 @@ Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, if (!subject->IsFlat()) FlattenString(subject); AssertNoAllocation no_heap_allocation; // ensure vectors stay valid - // Extract flattened substrings of cons strings before determining asciiness. 
String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)); + ASSERT(StringShape(needle).IsSequential()); int needle_len = needle->length(); ASSERT(needle->IsFlat()); @@ -347,10 +347,7 @@ bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { JSRegExp::Flags flags = re->GetFlags(); Handle<String> pattern(re->Pattern()); - if (!pattern->IsFlat()) { - FlattenString(pattern); - } - + if (!pattern->IsFlat()) FlattenString(pattern); RegExpCompileData compile_data; FlatStringReader reader(isolate, pattern); if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), @@ -434,22 +431,12 @@ void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, Handle<String> subject) { - if (!subject->IsFlat()) { - FlattenString(subject); - } + if (!subject->IsFlat()) FlattenString(subject); + // Check the asciiness of the underlying storage. - bool is_ascii; - { - AssertNoAllocation no_gc; - String* sequential_string = *subject; - if (subject->IsConsString()) { - sequential_string = ConsString::cast(*subject)->first(); - } - is_ascii = sequential_string->IsAsciiRepresentation(); - } - if (!EnsureCompiledIrregexp(regexp, is_ascii)) { - return -1; - } + bool is_ascii = subject->IsAsciiRepresentationUnderneath(); + if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; + #ifdef V8_INTERPRETED_REGEXP // Byte-code regexp needs space allocated for all its registers. return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); @@ -474,15 +461,11 @@ RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce( ASSERT(index <= subject->length()); ASSERT(subject->IsFlat()); - // A flat ASCII string might have a two-byte first part. 
- if (subject->IsConsString()) { - subject = Handle(ConsString::cast(*subject)->first(), isolate); - } + bool is_ascii = subject->IsAsciiRepresentationUnderneath(); #ifndef V8_INTERPRETED_REGEXP ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); do { - bool is_ascii = subject->IsAsciiRepresentation(); EnsureCompiledIrregexp(regexp, is_ascii); Handle code(IrregexpNativeCode(*irregexp, is_ascii), isolate); NativeRegExpMacroAssembler::Result res = @@ -510,13 +493,13 @@ RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce( // being internal and external, and even between being ASCII and UC16, // but the characters are always the same). IrregexpPrepare(regexp, subject); + is_ascii = subject->IsAsciiRepresentationUnderneath(); } while (true); UNREACHABLE(); return RE_EXCEPTION; #else // V8_INTERPRETED_REGEXP ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp)); - bool is_ascii = subject->IsAsciiRepresentation(); // We must have done EnsureCompiledIrregexp, so we can get the number of // registers. int* register_vector = output.start(); diff --git a/src/mark-compact.cc b/src/mark-compact.cc index 0961350..3e4a617 100644 --- a/src/mark-compact.cc +++ b/src/mark-compact.cc @@ -394,6 +394,10 @@ class StaticMarkingVisitor : public StaticVisitorBase { ConsString::BodyDescriptor, void>::Visit); + table_.Register(kVisitSlicedString, + &FixedBodyVisitor::Visit); table_.Register(kVisitFixedArray, &FlexibleBodyVisitorInNewSpace(this)); } + if (IsConsString()) { + ConsString::cast(this)->ConsStringVerify(); + } else if (IsSlicedString()) { + SlicedString::cast(this)->SlicedStringVerify(); + } +} + + +void ConsString::ConsStringVerify() { + CHECK(this->first()->IsString()); + CHECK(this->second() == GetHeap()->empty_string() || + this->second()->IsString()); + CHECK(this->length() >= String::kMinNonFlatLength); + if (this->IsFlat()) { + // A flat cons can only be created by String::SlowTryFlatten. 
+ // Afterwards, the first part may be externalized. + CHECK(this->first()->IsSeqString() || this->first()->IsExternalString()); + } +} + + +void SlicedString::SlicedStringVerify() { + CHECK(!this->parent()->IsConsString()); + CHECK(!this->parent()->IsSlicedString()); + CHECK(this->length() >= SlicedString::kMinLength); } diff --git a/src/objects-inl.h b/src/objects-inl.h index 098bd7a..b4c6a3a 100644 --- a/src/objects-inl.h +++ b/src/objects-inl.h @@ -178,10 +178,14 @@ bool Object::IsSymbol() { bool Object::IsConsString() { - if (!this->IsHeapObject()) return false; - uint32_t type = HeapObject::cast(this)->map()->instance_type(); - return (type & (kIsNotStringMask | kStringRepresentationMask)) == - (kStringTag | kConsStringTag); + if (!IsString()) return false; + return StringShape(String::cast(this)).IsCons(); +} + + +bool Object::IsSlicedString() { + if (!IsString()) return false; + return StringShape(String::cast(this)).IsSliced(); } @@ -269,6 +273,38 @@ bool String::IsTwoByteRepresentation() { } +bool String::IsAsciiRepresentationUnderneath() { + uint32_t type = map()->instance_type(); + STATIC_ASSERT(kIsIndirectStringTag != 0); + STATIC_ASSERT((kIsIndirectStringMask & kStringEncodingMask) == 0); + ASSERT(IsFlat()); + switch (type & (kIsIndirectStringMask | kStringEncodingMask)) { + case kAsciiStringTag: + return true; + case kTwoByteStringTag: + return false; + default: // Cons or sliced string. Need to go deeper. + return GetUnderlying()->IsAsciiRepresentation(); + } +} + + +bool String::IsTwoByteRepresentationUnderneath() { + uint32_t type = map()->instance_type(); + STATIC_ASSERT(kIsIndirectStringTag != 0); + STATIC_ASSERT((kIsIndirectStringMask & kStringEncodingMask) == 0); + ASSERT(IsFlat()); + switch (type & (kIsIndirectStringMask | kStringEncodingMask)) { + case kAsciiStringTag: + return false; + case kTwoByteStringTag: + return true; + default: // Cons or sliced string. Need to go deeper. 
+ return GetUnderlying()->IsTwoByteRepresentation(); + } +} + + bool String::HasOnlyAsciiChars() { uint32_t type = map()->instance_type(); return (type & kStringEncodingMask) == kAsciiStringTag || @@ -281,6 +317,16 @@ bool StringShape::IsCons() { } +bool StringShape::IsSliced() { + return (type_ & kStringRepresentationMask) == kSlicedStringTag; +} + + +bool StringShape::IsIndirect() { + return (type_ & kIsIndirectStringMask) == kIsIndirectStringTag; +} + + bool StringShape::IsExternal() { return (type_ & kStringRepresentationMask) == kExternalStringTag; } @@ -2075,6 +2121,7 @@ CAST_ACCESSOR(String) CAST_ACCESSOR(SeqString) CAST_ACCESSOR(SeqAsciiString) CAST_ACCESSOR(SeqTwoByteString) +CAST_ACCESSOR(SlicedString) CAST_ACCESSOR(ConsString) CAST_ACCESSOR(ExternalString) CAST_ACCESSOR(ExternalAsciiString) @@ -2156,7 +2203,7 @@ bool String::Equals(String* other) { MaybeObject* String::TryFlatten(PretenureFlag pretenure) { if (!StringShape(this).IsCons()) return this; ConsString* cons = ConsString::cast(this); - if (cons->second()->length() == 0) return cons->first(); + if (cons->IsFlat()) return cons->first(); return SlowTryFlatten(pretenure); } @@ -2164,10 +2211,8 @@ MaybeObject* String::TryFlatten(PretenureFlag pretenure) { String* String::TryFlattenGetString(PretenureFlag pretenure) { MaybeObject* flat = TryFlatten(pretenure); Object* successfully_flattened; - if (flat->ToObject(&successfully_flattened)) { - return String::cast(successfully_flattened); - } - return this; + if (!flat->ToObject(&successfully_flattened)) return this; + return String::cast(successfully_flattened); } @@ -2185,6 +2230,9 @@ uint16_t String::Get(int index) { return ExternalAsciiString::cast(this)->ExternalAsciiStringGet(index); case kExternalStringTag | kTwoByteStringTag: return ExternalTwoByteString::cast(this)->ExternalTwoByteStringGet(index); + case kSlicedStringTag | kAsciiStringTag: + case kSlicedStringTag | kTwoByteStringTag: + return SlicedString::cast(this)->SlicedStringGet(index); 
default: break; } @@ -2205,15 +2253,19 @@ void String::Set(int index, uint16_t value) { bool String::IsFlat() { - switch (StringShape(this).representation_tag()) { - case kConsStringTag: { - String* second = ConsString::cast(this)->second(); - // Only flattened strings have second part empty. - return second->length() == 0; - } - default: - return true; - } + if (!StringShape(this).IsCons()) return true; + return ConsString::cast(this)->second()->length() == 0; +} + + +String* String::GetUnderlying() { + // Giving direct access to underlying string only makes sense if the + // wrapping string is already flattened. + ASSERT(this->IsFlat()); + ASSERT(StringShape(this).IsIndirect()); + STATIC_ASSERT(ConsString::kFirstOffset == SlicedString::kParentOffset); + const int kUnderlyingOffset = SlicedString::kParentOffset; + return String::cast(READ_FIELD(this, kUnderlyingOffset)); } @@ -2272,6 +2324,20 @@ int SeqAsciiString::SeqAsciiStringSize(InstanceType instance_type) { } +String* SlicedString::parent() { + return String::cast(READ_FIELD(this, kParentOffset)); +} + + +void SlicedString::set_parent(String* parent) { + ASSERT(parent->IsSeqString()); + WRITE_FIELD(this, kParentOffset, parent); +} + + +SMI_ACCESSORS(SlicedString, offset, kOffsetOffset) + + String* ConsString::first() { return String::cast(READ_FIELD(this, kFirstOffset)); } diff --git a/src/objects-visiting.cc b/src/objects-visiting.cc index 84ab57f..bde9e83 100644 --- a/src/objects-visiting.cc +++ b/src/objects-visiting.cc @@ -58,6 +58,9 @@ StaticVisitorBase::VisitorId StaticVisitorBase::GetVisitorId( return kVisitConsString; } + case kSlicedStringTag: + return kVisitSlicedString; + case kExternalStringTag: return GetVisitorIdForSize(kVisitDataObject, kVisitDataObjectGeneric, diff --git a/src/objects-visiting.h b/src/objects-visiting.h index c96a8ef..4ce1bd0 100644 --- a/src/objects-visiting.h +++ b/src/objects-visiting.h @@ -115,6 +115,7 @@ class StaticVisitorBase : public AllStatic { kVisitStructGeneric, 
kVisitConsString, + kVisitSlicedString, kVisitOddball, kVisitCode, kVisitMap, @@ -299,6 +300,11 @@ class StaticNewSpaceVisitor : public StaticVisitorBase { ConsString::BodyDescriptor, int>::Visit); + table_.Register(kVisitSlicedString, + &FixedBodyVisitor::Visit); + table_.Register(kVisitFixedArray, &FlexibleBodyVisitor(this)-> @@ -5042,6 +5045,7 @@ String::FlatContent String::GetFlatContent() { int length = this->length(); StringShape shape(this); String* string = this; + int offset = 0; if (shape.representation_tag() == kConsStringTag) { ConsString* cons = ConsString::cast(string); if (cons->second()->length() != 0) { @@ -5050,6 +5054,14 @@ String::FlatContent String::GetFlatContent() { string = cons->first(); shape = StringShape(string); } + if (shape.representation_tag() == kSlicedStringTag) { + SlicedString* slice = SlicedString::cast(string); + offset = slice->offset(); + string = slice->parent(); + shape = StringShape(string); + ASSERT(shape.representation_tag() != kConsStringTag && + shape.representation_tag() != kSlicedStringTag); + } if (shape.encoding_tag() == kAsciiStringTag) { const char* start; if (shape.representation_tag() == kSeqStringTag) { @@ -5057,7 +5069,7 @@ String::FlatContent String::GetFlatContent() { } else { start = ExternalAsciiString::cast(string)->resource()->data(); } - return FlatContent(Vector(start, length)); + return FlatContent(Vector(start + offset, length)); } else { ASSERT(shape.encoding_tag() == kTwoByteStringTag); const uc16* start; @@ -5066,7 +5078,7 @@ String::FlatContent String::GetFlatContent() { } else { start = ExternalTwoByteString::cast(string)->resource()->data(); } - return FlatContent(Vector(start, length)); + return FlatContent(Vector(start + offset, length)); } } @@ -5138,13 +5150,17 @@ const uc16* String::GetTwoByteData() { const uc16* String::GetTwoByteData(unsigned start) { - ASSERT(!IsAsciiRepresentation()); + ASSERT(!IsAsciiRepresentationUnderneath()); switch (StringShape(this).representation_tag()) { case 
kSeqStringTag: return SeqTwoByteString::cast(this)->SeqTwoByteStringGetData(start); case kExternalStringTag: return ExternalTwoByteString::cast(this)-> ExternalTwoByteStringGetData(start); + case kSlicedStringTag: { + SlicedString* slice = SlicedString::cast(this); + return slice->parent()->GetTwoByteData(start + slice->offset()); + } case kConsStringTag: UNREACHABLE(); return NULL; @@ -5435,6 +5451,10 @@ const unibrow::byte* String::ReadBlock(String* input, max_chars); return rbb->util_buffer; } + case kSlicedStringTag: + return SlicedString::cast(input)->SlicedStringReadBlock(rbb, + offset_ptr, + max_chars); default: break; } @@ -5578,6 +5598,11 @@ void String::ReadBlockIntoBuffer(String* input, max_chars); } return; + case kSlicedStringTag: + SlicedString::cast(input)->SlicedStringReadBlockIntoBuffer(rbb, + offset_ptr, + max_chars); + return; default: break; } @@ -5712,6 +5737,31 @@ uint16_t ConsString::ConsStringGet(int index) { } +uint16_t SlicedString::SlicedStringGet(int index) { + return parent()->Get(offset() + index); +} + + +const unibrow::byte* SlicedString::SlicedStringReadBlock( + ReadBlockBuffer* buffer, unsigned* offset_ptr, unsigned chars) { + unsigned offset = this->offset(); + *offset_ptr += offset; + const unibrow::byte* answer = String::ReadBlock(String::cast(parent()), + buffer, offset_ptr, chars); + *offset_ptr -= offset; + return answer; +} + + +void SlicedString::SlicedStringReadBlockIntoBuffer( + ReadBlockBuffer* buffer, unsigned* offset_ptr, unsigned chars) { + unsigned offset = this->offset(); + *offset_ptr += offset; + String::ReadBlockIntoBuffer(String::cast(parent()), + buffer, offset_ptr, chars); + *offset_ptr -= offset; +} + template void String::WriteToFlat(String* src, sinkchar* sink, @@ -5779,6 +5829,13 @@ void String::WriteToFlat(String* src, } break; } + case kAsciiStringTag | kSlicedStringTag: + case kTwoByteStringTag | kSlicedStringTag: { + SlicedString* slice = SlicedString::cast(source); + unsigned offset = slice->offset(); 
+ WriteToFlat(slice->parent(), sink, from + offset, to + offset); + return; + } } } } diff --git a/src/objects.h b/src/objects.h index fc5d2e7..6c8888b 100644 --- a/src/objects.h +++ b/src/objects.h @@ -89,6 +89,7 @@ // - SeqString // - SeqAsciiString // - SeqTwoByteString +// - SlicedString // - ConsString // - ExternalString // - ExternalAsciiString @@ -283,6 +284,7 @@ static const int kVariableSizeSentinel = 0; V(ASCII_STRING_TYPE) \ V(CONS_STRING_TYPE) \ V(CONS_ASCII_STRING_TYPE) \ + V(SLICED_STRING_TYPE) \ V(EXTERNAL_STRING_TYPE) \ V(EXTERNAL_STRING_WITH_ASCII_DATA_TYPE) \ V(EXTERNAL_ASCII_STRING_TYPE) \ @@ -401,6 +403,14 @@ static const int kVariableSizeSentinel = 0; ConsString::kSize, \ cons_ascii_string, \ ConsAsciiString) \ + V(SLICED_STRING_TYPE, \ + SlicedString::kSize, \ + sliced_string, \ + SlicedString) \ + V(SLICED_ASCII_STRING_TYPE, \ + SlicedString::kSize, \ + sliced_ascii_string, \ + SlicedAsciiString) \ V(EXTERNAL_STRING_TYPE, \ ExternalTwoByteString::kSize, \ external_string, \ @@ -474,9 +484,17 @@ const uint32_t kStringRepresentationMask = 0x03; enum StringRepresentationTag { kSeqStringTag = 0x0, kConsStringTag = 0x1, - kExternalStringTag = 0x2 + kExternalStringTag = 0x2, + kSlicedStringTag = 0x3 }; -const uint32_t kIsConsStringMask = 0x1; +const uint32_t kIsIndirectStringMask = 0x1; +const uint32_t kIsIndirectStringTag = 0x1; +STATIC_ASSERT((kSeqStringTag & kIsIndirectStringMask) == 0); +STATIC_ASSERT((kExternalStringTag & kIsIndirectStringMask) == 0); +STATIC_ASSERT( + (kConsStringTag & kIsIndirectStringMask) == kIsIndirectStringTag); +STATIC_ASSERT( + (kSlicedStringTag & kIsIndirectStringMask) == kIsIndirectStringTag); // If bit 7 is clear, then bit 3 indicates whether this two-byte // string actually contains ascii data. 
@@ -511,6 +529,8 @@ enum InstanceType { ASCII_STRING_TYPE = kAsciiStringTag | kSeqStringTag, CONS_STRING_TYPE = kTwoByteStringTag | kConsStringTag, CONS_ASCII_STRING_TYPE = kAsciiStringTag | kConsStringTag, + SLICED_STRING_TYPE = kTwoByteStringTag | kSlicedStringTag, + SLICED_ASCII_STRING_TYPE = kAsciiStringTag | kSlicedStringTag, EXTERNAL_STRING_TYPE = kTwoByteStringTag | kExternalStringTag, EXTERNAL_STRING_WITH_ASCII_DATA_TYPE = kTwoByteStringTag | kExternalStringTag | kAsciiDataHintTag, @@ -718,6 +738,7 @@ class MaybeObject BASE_EMBEDDED { V(SeqString) \ V(ExternalString) \ V(ConsString) \ + V(SlicedString) \ V(ExternalTwoByteString) \ V(ExternalAsciiString) \ V(SeqTwoByteString) \ @@ -5783,6 +5804,8 @@ class StringShape BASE_EMBEDDED { inline bool IsSequential(); inline bool IsExternal(); inline bool IsCons(); + inline bool IsSliced(); + inline bool IsIndirect(); inline bool IsExternalAscii(); inline bool IsExternalTwoByte(); inline bool IsSequentialAscii(); @@ -5874,14 +5897,19 @@ class String: public HeapObject { inline uint32_t hash_field(); inline void set_hash_field(uint32_t value); - inline bool IsAsciiRepresentation(); - inline bool IsTwoByteRepresentation(); - // Returns whether this string has only ASCII chars, i.e. all of them can // be ASCII encoded. This might be the case even if the string is // two-byte. Such strings may appear when the embedder prefers // two-byte external representations even for ASCII data. - // + inline bool IsAsciiRepresentation(); + inline bool IsTwoByteRepresentation(); + + // Cons and slices have an encoding flag that may not represent the actual + // encoding of the underlying string. This is taken into account here. + // Requires: this->IsFlat() + inline bool IsAsciiRepresentationUnderneath(); + inline bool IsTwoByteRepresentationUnderneath(); + // NOTE: this should be considered only a hint. False negatives are // possible. 
inline bool HasOnlyAsciiChars(); @@ -5921,6 +5949,10 @@ class String: public HeapObject { // kind. FlatContent GetFlatContent(); + // Returns the parent of a sliced string or first part of a flat cons string. + // Requires: StringShape(this).IsIndirect() && this->IsFlat() + inline String* GetUnderlying(); + // Mark the string as an undetectable object. It only applies to // ascii and two byte string types. bool MarkAsUndetectable(); @@ -6349,11 +6381,69 @@ class ConsString: public String { typedef FixedBodyDescriptor BodyDescriptor; +#ifdef DEBUG + void ConsStringVerify(); +#endif + private: DISALLOW_IMPLICIT_CONSTRUCTORS(ConsString); }; +// The Sliced String class describes strings that are substrings of another +// sequential string. The motivation is to save time and memory when creating +// a substring. A Sliced String is described as a pointer to the parent, +// the offset from the start of the parent string and the length. Using +// a Sliced String therefore requires unpacking of the parent string and +// adding the offset to the start address. A substring of a Sliced String +// are not nested since the double indirection is simplified when creating +// such a substring. +// Currently missing features are: +// - handling externalized parent strings +// - external strings as parent +// - truncating sliced string to enable otherwise unneeded parent to be GC'ed. +class SlicedString: public String { + public: + + inline String* parent(); + inline void set_parent(String* parent); + inline int offset(); + inline void set_offset(int offset); + + // Dispatched behavior. + uint16_t SlicedStringGet(int index); + + // Casting. + static inline SlicedString* cast(Object* obj); + + // Layout description. 
+ static const int kParentOffset = POINTER_SIZE_ALIGN(String::kSize); + static const int kOffsetOffset = kParentOffset + kPointerSize; + static const int kSize = kOffsetOffset + kPointerSize; + + // Support for StringInputBuffer + inline const unibrow::byte* SlicedStringReadBlock(ReadBlockBuffer* buffer, + unsigned* offset_ptr, + unsigned chars); + inline void SlicedStringReadBlockIntoBuffer(ReadBlockBuffer* buffer, + unsigned* offset_ptr, + unsigned chars); + // Minimum length for a sliced string. + static const int kMinLength = 13; + + typedef FixedBodyDescriptor + BodyDescriptor; + +#ifdef DEBUG + void SlicedStringVerify(); +#endif + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SlicedString); +}; + + // The ExternalString class describes string values that are backed by // a string resource that lies outside the V8 heap. ExternalStrings // consist of the length field common to all strings, a pointer to the diff --git a/src/regexp-macro-assembler.cc b/src/regexp-macro-assembler.cc index 5578243..f91ea93 100644 --- a/src/regexp-macro-assembler.cc +++ b/src/regexp-macro-assembler.cc @@ -120,27 +120,31 @@ NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match( String* subject_ptr = *subject; // Character offsets into string. int start_offset = previous_index; - int end_offset = subject_ptr->length(); + int char_length = subject_ptr->length() - start_offset; + int slice_offset = 0; - // The string has been flattened, so it it is a cons string it contains the + // The string has been flattened, so if it is a cons string it contains the // full string in the first part. 
if (StringShape(subject_ptr).IsCons()) { ASSERT_EQ(0, ConsString::cast(subject_ptr)->second()->length()); subject_ptr = ConsString::cast(subject_ptr)->first(); + } else if (StringShape(subject_ptr).IsSliced()) { + SlicedString* slice = SlicedString::cast(subject_ptr); + subject_ptr = slice->parent(); + slice_offset = slice->offset(); } // Ensure that an underlying string has the same ascii-ness. bool is_ascii = subject_ptr->IsAsciiRepresentation(); ASSERT(subject_ptr->IsExternalString() || subject_ptr->IsSeqString()); // String is now either Sequential or External int char_size_shift = is_ascii ? 0 : 1; - int char_length = end_offset - start_offset; const byte* input_start = - StringCharacterPosition(subject_ptr, start_offset); + StringCharacterPosition(subject_ptr, start_offset + slice_offset); int byte_length = char_length << char_size_shift; const byte* input_end = input_start + byte_length; Result res = Execute(*regexp_code, - subject_ptr, + *subject, start_offset, input_start, input_end, @@ -152,7 +156,7 @@ NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match( NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute( Code* code, - String* input, + String* input, // This needs to be the unpacked (sliced, cons) string. 
int start_offset, const byte* input_start, const byte* input_end, diff --git a/src/runtime.cc b/src/runtime.cc index d49cb10..aa8983f 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -3674,7 +3674,7 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_RegExpExecMultiple) { HandleScope handles(isolate); CONVERT_ARG_CHECKED(String, subject, 1); - if (!subject->IsFlat()) { FlattenString(subject); } + if (!subject->IsFlat()) FlattenString(subject); CONVERT_ARG_CHECKED(JSRegExp, regexp, 0); CONVERT_ARG_CHECKED(JSArray, last_match_info, 2); CONVERT_ARG_CHECKED(JSArray, result_array, 3); diff --git a/src/x64/code-stubs-x64.cc b/src/x64/code-stubs-x64.cc index 94ed0cb..0b785c5 100644 --- a/src/x64/code-stubs-x64.cc +++ b/src/x64/code-stubs-x64.cc @@ -2374,7 +2374,6 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ testq(kScratchRegister, kScratchRegister); __ j(zero, &runtime); - // Check that the first argument is a JSRegExp object. __ movq(rax, Operand(rsp, kJSRegExpOffset)); __ JumpIfSmi(rax, &runtime); @@ -2445,10 +2444,14 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { __ cmpl(rdx, rdi); __ j(greater, &runtime); + // Reset offset for possibly sliced string. + __ Set(r14, 0); // rax: RegExp data (FixedArray) // Check the representation and encoding of the subject string. Label seq_ascii_string, seq_two_byte_string, check_code; __ movq(rdi, Operand(rsp, kSubjectOffset)); + // Make a copy of the original subject string. + __ movq(r15, rdi); __ movq(rbx, FieldOperand(rdi, HeapObject::kMapOffset)); __ movzxbl(rbx, FieldOperand(rbx, Map::kInstanceTypeOffset)); // First check for flat two byte string. @@ -2457,28 +2460,40 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { STATIC_ASSERT((kStringTag | kSeqStringTag | kTwoByteStringTag) == 0); __ j(zero, &seq_two_byte_string, Label::kNear); // Any other flat string must be a flat ascii string. 
- __ testb(rbx, Immediate(kIsNotStringMask | kStringRepresentationMask)); + __ andb(rbx, Immediate(kIsNotStringMask | kStringRepresentationMask)); __ j(zero, &seq_ascii_string, Label::kNear); - // Check for flat cons string. + // Check for flat cons string or sliced string. // A flat cons string is a cons string where the second part is the empty // string. In that case the subject string is just the first part of the cons // string. Also in this case the first part of the cons string is known to be // a sequential string or an external string. - STATIC_ASSERT(kExternalStringTag !=0); - STATIC_ASSERT((kConsStringTag & kExternalStringTag) == 0); - __ testb(rbx, Immediate(kIsNotStringMask | kExternalStringTag)); - __ j(not_zero, &runtime); - // String is a cons string. + // In the case of a sliced string its offset has to be taken into account. + Label cons_string, check_encoding; + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmpq(rbx, Immediate(kExternalStringTag)); + __ j(less, &cons_string, Label::kNear); + __ j(equal, &runtime); + + // String is sliced. + __ SmiToInteger32(r14, FieldOperand(rdi, SlicedString::kOffsetOffset)); + __ movq(rdi, FieldOperand(rdi, SlicedString::kParentOffset)); + // r14: slice offset + // r15: original subject string + // rdi: parent string + __ jmp(&check_encoding, Label::kNear); + // String is a cons string, check whether it is flat. + __ bind(&cons_string); __ CompareRoot(FieldOperand(rdi, ConsString::kSecondOffset), Heap::kEmptyStringRootIndex); __ j(not_equal, &runtime); __ movq(rdi, FieldOperand(rdi, ConsString::kFirstOffset)); + // rdi: first part of cons string or parent of sliced string. + // rbx: map of first part of cons string or map of parent of sliced string. + // Is first part of cons or parent of slice a flat two byte string? 
+ __ bind(&check_encoding); __ movq(rbx, FieldOperand(rdi, HeapObject::kMapOffset)); - // String is a cons string with empty second part. - // rdi: first part of cons string. - // rbx: map of first part of cons string. - // Is first part a flat two byte string? __ testb(FieldOperand(rbx, Map::kInstanceTypeOffset), Immediate(kStringRepresentationMask | kStringEncodingMask)); STATIC_ASSERT((kSeqStringTag | kTwoByteStringTag) == 0); @@ -2575,33 +2590,40 @@ void RegExpExecStub::Generate(MacroAssembler* masm) { // rbx: previous index // rcx: encoding of subject string (1 if ascii 0 if two_byte); // r11: code + // r14: slice offset + // r15: original subject string + + // Argument 2: Previous index. + __ movq(arg2, rbx); // Argument 4: End of string data // Argument 3: Start of string data - Label setup_two_byte, setup_rest; + Label setup_two_byte, setup_rest, got_length, length_not_from_slice; + // Prepare start and end index of the input. + // Load the length from the original sliced string if that is the case. + __ addq(rbx, r14); + __ SmiToInteger32(arg3, FieldOperand(r15, String::kLengthOffset)); + __ addq(r14, arg3); // Using arg3 as scratch. + + // rbx: start index of the input + // r14: end index of the input + // r15: original subject string __ testb(rcx, rcx); // Last use of rcx as encoding of subject string. 
+ __ j(zero, &setup_two_byte, Label::kNear); - __ SmiToInteger32(rcx, FieldOperand(rdi, String::kLengthOffset)); - __ lea(arg4, FieldOperand(rdi, rcx, times_1, SeqAsciiString::kHeaderSize)); + __ lea(arg4, FieldOperand(rdi, r14, times_1, SeqAsciiString::kHeaderSize)); __ lea(arg3, FieldOperand(rdi, rbx, times_1, SeqAsciiString::kHeaderSize)); __ jmp(&setup_rest, Label::kNear); __ bind(&setup_two_byte); - __ SmiToInteger32(rcx, FieldOperand(rdi, String::kLengthOffset)); - __ lea(arg4, FieldOperand(rdi, rcx, times_2, SeqTwoByteString::kHeaderSize)); + __ lea(arg4, FieldOperand(rdi, r14, times_2, SeqTwoByteString::kHeaderSize)); __ lea(arg3, FieldOperand(rdi, rbx, times_2, SeqTwoByteString::kHeaderSize)); - __ bind(&setup_rest); - // Argument 2: Previous index. - __ movq(arg2, rbx); - // Argument 1: Subject string. -#ifdef _WIN64 - __ movq(arg1, rdi); -#else - // Already there in AMD64 calling convention. - ASSERT(arg1.is(rdi)); - USE(arg1); -#endif + // Argument 1: Original subject string. + // The original subject was saved in r15 before any unpacking. Pass it + // rather than rdi, which by now may hold the parent of a sliced string or + // the first part of a flat cons string; the slice offset is already + // folded into the start and end addresses computed above. + __ movq(arg1, r15); // Locate the code entry and call it. __ addq(r11, Immediate(Code::kHeaderSize - kHeapObjectTag)); @@ -3851,6 +3873,7 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { Label flat_string; Label ascii_string; Label got_char_code; + Label sliced_string; // If the receiver is a smi trigger the non-string case. __ JumpIfSmi(object_, receiver_not_string_); @@ -3879,25 +3902,39 @@ void StringCharCodeAtGenerator::GenerateFast(MacroAssembler* masm) { __ j(zero, &flat_string); // Handle non-flat strings.
- __ testb(result_, Immediate(kIsConsStringMask)); - __ j(zero, &call_runtime_); + __ and_(result_, Immediate(kStringRepresentationMask)); + STATIC_ASSERT((kConsStringTag < kExternalStringTag)); + STATIC_ASSERT((kSlicedStringTag > kExternalStringTag)); + __ cmpb(result_, Immediate(kExternalStringTag)); + __ j(greater, &sliced_string); + __ j(equal, &call_runtime_); // ConsString. // Check whether the right hand side is the empty string (i.e. if // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. + Label assure_seq_string; __ CompareRoot(FieldOperand(object_, ConsString::kSecondOffset), Heap::kEmptyStringRootIndex); __ j(not_equal, &call_runtime_); // Get the first of the two strings and load its instance type. __ movq(object_, FieldOperand(object_, ConsString::kFirstOffset)); + __ jmp(&assure_seq_string, Label::kNear); + + // SlicedString, unpack and add offset. + __ bind(&sliced_string); + __ addq(scratch_, FieldOperand(object_, SlicedString::kOffsetOffset)); + __ movq(object_, FieldOperand(object_, SlicedString::kParentOffset)); + + __ bind(&assure_seq_string); __ movq(result_, FieldOperand(object_, HeapObject::kMapOffset)); __ movzxbl(result_, FieldOperand(result_, Map::kInstanceTypeOffset)); // If the first cons component is also non-flat, then go to runtime. STATIC_ASSERT(kSeqStringTag == 0); __ testb(result_, Immediate(kStringRepresentationMask)); __ j(not_zero, &call_runtime_); + __ jmp(&flat_string); // Check for 1-byte or 2-byte string. __ bind(&flat_string); @@ -4208,6 +4245,8 @@ void StringAddStub::Generate(MacroAssembler* masm) { __ and_(rcx, Immediate(kStringRepresentationMask)); __ cmpl(rcx, Immediate(kExternalStringTag)); __ j(equal, &string_add_runtime); + // We cannot encounter sliced strings here since: + STATIC_ASSERT(SlicedString::kMinLength >= String::kMinNonFlatLength); // Now check if both strings are ascii strings. 
// rax: first string // rbx: length of resulting flat string @@ -4600,6 +4639,9 @@ void StringHelper::GenerateHashGetHash(MacroAssembler* masm, void SubStringStub::Generate(MacroAssembler* masm) { Label runtime; + if (FLAG_string_slices) { + __ jmp(&runtime); + } // Stack frame on entry. // rsp[0]: return address // rsp[8]: to diff --git a/src/x64/lithium-codegen-x64.cc b/src/x64/lithium-codegen-x64.cc index acbac44..76a9453 100644 --- a/src/x64/lithium-codegen-x64.cc +++ b/src/x64/lithium-codegen-x64.cc @@ -3200,95 +3200,80 @@ void LCodeGen::DoStringCharCodeAt(LStringCharCodeAt* instr) { }; Register string = ToRegister(instr->string()); - Register index = no_reg; - int const_index = -1; - if (instr->index()->IsConstantOperand()) { - const_index = ToInteger32(LConstantOperand::cast(instr->index())); - STATIC_ASSERT(String::kMaxLength <= Smi::kMaxValue); - if (!Smi::IsValid(const_index)) { - // Guaranteed to be out of bounds because of the assert above. - // So the bounds check that must dominate this instruction must - // have deoptimized already. - if (FLAG_debug_code) { - __ Abort("StringCharCodeAt: out of bounds index."); - } - // No code needs to be generated. - return; - } - } else { - index = ToRegister(instr->index()); - } + Register index = ToRegister(instr->index()); Register result = ToRegister(instr->result()); DeferredStringCharCodeAt* deferred = new DeferredStringCharCodeAt(this, instr); - Label flat_string, ascii_string, done; - // Fetch the instance type of the receiver into result register. __ movq(result, FieldOperand(string, HeapObject::kMapOffset)); __ movzxbl(result, FieldOperand(result, Map::kInstanceTypeOffset)); - // We need special handling for non-sequential strings. - STATIC_ASSERT(kSeqStringTag == 0); - __ testb(result, Immediate(kStringRepresentationMask)); - __ j(zero, &flat_string, Label::kNear); - - // Handle cons strings and go to deferred code for the rest. 
- __ testb(result, Immediate(kIsConsStringMask)); - __ j(zero, deferred->entry()); - - // ConsString. + // We need special handling for indirect strings. + Label check_sequential; + __ testb(result, Immediate(kIsIndirectStringMask)); + __ j(zero, &check_sequential, Label::kNear); + + // Dispatch on the indirect string shape: slice or cons. + Label cons_string; + const uint32_t kSlicedNotConsMask = kSlicedStringTag & ~kConsStringTag; + ASSERT(IsPowerOf2(kSlicedNotConsMask) && kSlicedNotConsMask != 0); + __ testb(result, Immediate(kSlicedNotConsMask)); + __ j(zero, &cons_string, Label::kNear); + + // Handle slices. + Label indirect_string_loaded; + __ SmiToInteger32(result, FieldOperand(string, SlicedString::kOffsetOffset)); + __ addq(index, result); + __ movq(string, FieldOperand(string, SlicedString::kParentOffset)); + __ jmp(&indirect_string_loaded, Label::kNear); + + // Handle conses. // Check whether the right hand side is the empty string (i.e. if // this is really a flat string in a cons string). If that is not // the case we would rather go to the runtime system now to flatten // the string. + __ bind(&cons_string); __ CompareRoot(FieldOperand(string, ConsString::kSecondOffset), Heap::kEmptyStringRootIndex); __ j(not_equal, deferred->entry()); - // Get the first of the two strings and load its instance type. __ movq(string, FieldOperand(string, ConsString::kFirstOffset)); + + __ bind(&indirect_string_loaded); __ movq(result, FieldOperand(string, HeapObject::kMapOffset)); __ movzxbl(result, FieldOperand(result, Map::kInstanceTypeOffset)); - // If the first cons component is also non-flat, then go to runtime. + + // Check whether the string is sequential. The only non-sequential + // shapes we support have just been unwrapped above. + __ bind(&check_sequential); STATIC_ASSERT(kSeqStringTag == 0); __ testb(result, Immediate(kStringRepresentationMask)); __ j(not_zero, deferred->entry()); - // Check for ASCII or two-byte string. 
- __ bind(&flat_string); + // Dispatch on the encoding: ASCII or two-byte. + Label ascii_string; STATIC_ASSERT(kAsciiStringTag != 0); __ testb(result, Immediate(kStringEncodingMask)); __ j(not_zero, &ascii_string, Label::kNear); // Two-byte string. // Load the two-byte character code into the result register. + Label done; STATIC_ASSERT(kSmiTag == 0 && kSmiTagSize == 1); - if (instr->index()->IsConstantOperand()) { - __ movzxwl(result, - FieldOperand(string, - SeqTwoByteString::kHeaderSize + - (kUC16Size * const_index))); - } else { - __ movzxwl(result, FieldOperand(string, - index, - times_2, - SeqTwoByteString::kHeaderSize)); - } + __ movzxwl(result, FieldOperand(string, + index, + times_2, + SeqTwoByteString::kHeaderSize)); __ jmp(&done, Label::kNear); // ASCII string. // Load the byte into the result register. __ bind(&ascii_string); - if (instr->index()->IsConstantOperand()) { - __ movzxbl(result, FieldOperand(string, - SeqAsciiString::kHeaderSize + const_index)); - } else { - __ movzxbl(result, FieldOperand(string, - index, - times_1, - SeqAsciiString::kHeaderSize)); - } + __ movzxbl(result, FieldOperand(string, + index, + times_1, + SeqAsciiString::kHeaderSize)); __ bind(&done); __ bind(deferred->exit()); } diff --git a/src/x64/lithium-x64.cc b/src/x64/lithium-x64.cc index 9d08d37..bd31956 100644 --- a/src/x64/lithium-x64.cc +++ b/src/x64/lithium-x64.cc @@ -1984,8 +1984,8 @@ LInstruction* LChunkBuilder::DoStringAdd(HStringAdd* instr) { LInstruction* LChunkBuilder::DoStringCharCodeAt(HStringCharCodeAt* instr) { - LOperand* string = UseRegister(instr->string()); - LOperand* index = UseRegisterOrConstant(instr->index()); + LOperand* string = UseTempRegister(instr->string()); + LOperand* index = UseTempRegister(instr->index()); LStringCharCodeAt* result = new LStringCharCodeAt(string, index); return AssignEnvironment(AssignPointerMap(DefineAsRegister(result))); } diff --git a/src/x64/regexp-macro-assembler-x64.cc b/src/x64/regexp-macro-assembler-x64.cc index 
395466e..8595c6d 100644 --- a/src/x64/regexp-macro-assembler-x64.cc +++ b/src/x64/regexp-macro-assembler-x64.cc @@ -1170,12 +1170,13 @@ int RegExpMacroAssemblerX64::CheckStackGuardState(Address* return_address, } // Prepare for possible GC. - HandleScope handles; + HandleScope handles(isolate); Handle code_handle(re_code); Handle subject(frame_entry(re_frame, kInputString)); + // Current string. - bool is_ascii = subject->IsAsciiRepresentation(); + bool is_ascii = subject->IsAsciiRepresentationUnderneath(); ASSERT(re_code->instruction_start() <= *return_address); ASSERT(*return_address <= @@ -1184,7 +1185,7 @@ int RegExpMacroAssemblerX64::CheckStackGuardState(Address* return_address, MaybeObject* result = Execution::HandleStackGuardInterrupt(); if (*code_handle != re_code) { // Return address no longer valid - intptr_t delta = *code_handle - re_code; + int delta = *code_handle - re_code; // Overwrite the return address on the stack. *return_address += delta; } @@ -1193,8 +1194,20 @@ int RegExpMacroAssemblerX64::CheckStackGuardState(Address* return_address, return EXCEPTION; } + Handle subject_tmp = subject; + int slice_offset = 0; + + // Extract the underlying string and the slice offset. + if (StringShape(*subject_tmp).IsCons()) { + subject_tmp = Handle(ConsString::cast(*subject_tmp)->first()); + } else if (StringShape(*subject_tmp).IsSliced()) { + SlicedString* slice = SlicedString::cast(*subject_tmp); + subject_tmp = Handle(slice->parent()); + slice_offset = slice->offset(); + } + // String might have changed. - if (subject->IsAsciiRepresentation() != is_ascii) { + if (subject_tmp->IsAsciiRepresentation() != is_ascii) { // If we changed between an ASCII and an UC16 string, the specialized // code cannot be used, and we need to restart regexp matching from // scratch (including, potentially, compiling a new version of the code). 
@@ -1205,8 +1218,8 @@ int RegExpMacroAssemblerX64::CheckStackGuardState(Address* return_address, // be a sequential or external string with the same content. // Update the start and end pointers in the stack frame to the current // location (whether it has actually moved or not). - ASSERT(StringShape(*subject).IsSequential() || - StringShape(*subject).IsExternal()); + ASSERT(StringShape(*subject_tmp).IsSequential() || + StringShape(*subject_tmp).IsExternal()); // The original start address of the characters to match. const byte* start_address = frame_entry(re_frame, kInputStart); @@ -1214,7 +1227,8 @@ int RegExpMacroAssemblerX64::CheckStackGuardState(Address* return_address, // Find the current start address of the same character at the current string // position. int start_index = frame_entry(re_frame, kStartIndex); - const byte* new_address = StringCharacterPosition(*subject, start_index); + const byte* new_address = StringCharacterPosition(*subject_tmp, + start_index + slice_offset); if (start_address != new_address) { // If there is a difference, update the object pointer and start and end diff --git a/test/cctest/test-debug.cc b/test/cctest/test-debug.cc index 58d970c..b7962de 100644 --- a/test/cctest/test-debug.cc +++ b/test/cctest/test-debug.cc @@ -2174,7 +2174,7 @@ TEST(ScriptBreakPointLine) { f = v8::Local::Cast(env->Global()->Get(v8::String::New("f"))); g = v8::Local::Cast(env->Global()->Get(v8::String::New("g"))); - // Chesk that a break point was hit when the script was run. + // Check that a break point was hit when the script was run. 
CHECK_EQ(1, break_point_hit_count); CHECK_EQ(0, StrLength(last_function_hit)); diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc index 4d9b264..17020a3 100644 --- a/test/cctest/test-strings.cc +++ b/test/cctest/test-strings.cc @@ -430,8 +430,7 @@ TEST(ExternalShortStringAdd) { " return 0;" "};" "test()"; - CHECK_EQ(0, - v8::Script::Compile(v8::String::New(source))->Run()->Int32Value()); + CHECK_EQ(0, CompileRun(source)->Int32Value()); } @@ -481,3 +480,52 @@ TEST(CachedHashOverflow) { } } } + + +TEST(SliceFromCons) { + FLAG_string_slices = true; + InitializeVM(); + v8::HandleScope scope; + Handle string = + FACTORY->NewStringFromAscii(CStrVector("parentparentparent")); + Handle parent = FACTORY->NewConsString(string, string); + CHECK(parent->IsConsString()); + CHECK(!parent->IsFlat()); + Handle slice = FACTORY->NewSubString(parent, 1, 25); + // After slicing, the original string becomes a flat cons. + CHECK(parent->IsFlat()); + CHECK(slice->IsSlicedString()); + CHECK_EQ(SlicedString::cast(*slice)->parent(), + ConsString::cast(*parent)->first()); + CHECK(SlicedString::cast(*slice)->parent()->IsSeqString()); + CHECK(slice->IsFlat()); +} + + +TEST(TrivialSlice) { + // This tests whether a slice that contains the entire parent string + // actually creates a new string (it should not). 
+ FLAG_string_slices = true; + InitializeVM(); + HandleScope scope; + v8::Local result; + Handle string; + const char* init = "var str = 'abcdefghijklmnopqrstuvwxyz';"; + const char* check = "str.slice(0,26)"; + const char* crosscheck = "str.slice(1,25)"; + + CompileRun(init); + + result = CompileRun(check); + CHECK(result->IsString()); + string = v8::Utils::OpenHandle(v8::String::Cast(*result)); + CHECK(!string->IsSlicedString()); + + string = FACTORY->NewSubString(string, 0, 26); + CHECK(!string->IsSlicedString()); + result = CompileRun(crosscheck); + CHECK(result->IsString()); + string = v8::Utils::OpenHandle(v8::String::Cast(*result)); + CHECK(string->IsSlicedString()); + CHECK_EQ("bcdefghijklmnopqrstuvwxy", *(string->ToCString())); +} diff --git a/test/mjsunit/string-slices-regexp.js b/test/mjsunit/string-slices-regexp.js new file mode 100644 index 0000000..a8cadae --- /dev/null +++ b/test/mjsunit/string-slices-regexp.js @@ -0,0 +1,81 @@ +// Copyright 2009 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Flags: --string-slices + +//assertEquals('345"12345 6"1234567"123', +// '12345""12345 6""1234567""1234'.slice(2,-1).replace(/""/g, '"')); + +var foo = "lsdfj sldkfj sdklfj læsdfjl sdkfjlsdk fjsdl fjsdljskdj flsj flsdkj flskd regexp: /foobar/\nldkfj sdlkfj sdkl"; +for(var i = 0; i < 1000; i++) { + assertTrue(/^([a-z]+): (.*)/.test(foo.substring(foo.indexOf("regexp:")))); + assertEquals("regexp", RegExp.$1, "RegExp.$1"); +} + +var re = /^(((N({)?)|(R)|(U)|(V)|(B)|(H)|(n((n)|(r)|(v)|(h))?)|(r(r)?)|(v)|(b((n)|(b))?)|(h))|((Y)|(A)|(E)|(o(u)?)|(p(u)?)|(q(u)?)|(s)|(t)|(u)|(w)|(x(u)?)|(y)|(z)|(a((T)|(A)|(L))?)|(c)|(e)|(f(u)?)|(g(u)?)|(i)|(j)|(l)|(m(u)?)))+/; +var r = new RegExp(re) +var str = "_Avtnennan gunzvmu pubExnY nEvln vaTxh rmuhguhaTxnY_".slice(1,-1); +str = str + str; +assertTrue(r.test(str)); +assertTrue(r.test(str)); +var re = /x/; +assertEquals("a.yb", "_axyb_".slice(1,-1).replace(re, ".")); +re.compile("y"); +assertEquals("ax.b", "_axyb_".slice(1,-1).replace(re, ".")); +re.compile("(x)"); +assertEquals(["x", "x"], re.exec("_axyb_".slice(1,-1))); +re.compile("(y)"); +assertEquals(["y", "y"], re.exec("_axyb_".slice(1,-1))); + +for(var i = 0; i < 100; i++) { + var a = "aaaaaaaaaaaaaaaaaaaaaaaabbaacabbabaaaaabbaaaabbac".slice(24,-1); + var b = "bbaacabbabaaaaabbaaaabba" + a; + // The first time, the cons string will be flattened and handled by the + // runtime system. 
+ assertEquals(["bbaa", "a", "", "a"], /((\3|b)\2(a)){2,}/.exec(b)); + // The second time, the cons string is already flattened and will be + // handled by generated code. + assertEquals(["bbaa", "a", "", "a"], /((\3|b)\2(a)){2,}/.exec(b)); + assertEquals(["bbaa", "a", "", "a"], /((\3|b)\2(a)){2,}/.exec(a)); + assertEquals(["bbaa", "a", "", "a"], /((\3|b)\2(a)){2,}/.exec(a)); +} + +var c = "ABCDEFGHIJKLMN".slice(2,-2); +var d = "ABCDEF\u1234GHIJKLMN".slice(2,-2); +var e = "ABCDEFGHIJKLMN".slice(0,-2); +assertTrue(/^C.*L$/.test(c)); +assertTrue(/^C.*L$/.test(c)); +assertTrue(/^C.*L$/.test(d)); +assertTrue(/^C.*L$/.test(d)); +assertTrue(/^A\w{10}L$/.test(e)); +assertTrue(/^A\w{10}L$/.test(e)); + +var e = "qui-opIasd-fghjklzx-cvbn-mqwer-tyuio-pasdf-ghIjkl-zx".slice(6,-6); +var e_split = e.split("-"); +assertEquals(e_split[0], "Iasd"); +assertEquals(e_split[1], "fghjklzx"); +assertEquals(e_split[6], "ghI"); diff --git a/test/mjsunit/string-slices.js b/test/mjsunit/string-slices.js new file mode 100755 index 0000000..b0b05ec --- /dev/null +++ b/test/mjsunit/string-slices.js @@ -0,0 +1,198 @@ +// Copyright 2008 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Flags: --string-slices --expose-externalize-string + +var s = 'abcdefghijklmn'; +assertEquals(s, s.substr()); +assertEquals(s, s.substr(0)); +assertEquals(s, s.substr('0')); +assertEquals(s, s.substr(void 0)); +assertEquals(s, s.substr(null)); +assertEquals(s, s.substr(false)); +assertEquals(s, s.substr(0.9)); +assertEquals(s, s.substr({ valueOf: function() { return 0; } })); +assertEquals(s, s.substr({ toString: function() { return '0'; } })); + +var s1 = s.substring(1); +assertEquals(s1, s.substr(1)); +assertEquals(s1, s.substr('1')); +assertEquals(s1, s.substr(true)); +assertEquals(s1, s.substr(1.1)); +assertEquals(s1, s.substr({ valueOf: function() { return 1; } })); +assertEquals(s1, s.substr({ toString: function() { return '1'; } })); + + +assertEquals(s.substring(s.length - 1), s.substr(-1)); +assertEquals(s.substring(s.length - 1), s.substr(-1.2)); +assertEquals(s.substring(s.length - 1), s.substr(-1.7)); +assertEquals(s.substring(s.length - 2), s.substr(-2)); +assertEquals(s.substring(s.length - 2), s.substr(-2.3)); +assertEquals(s.substring(s.length - 2, s.length - 1), s.substr(-2, 1)); +assertEquals(s, s.substr(-100)); +assertEquals('abc', s.substr(-100, 3)); 
+assertEquals(s1, s.substr(-s.length + 1)); + +// assertEquals('', s.substr(0, void 0)); // smjs and rhino +assertEquals('abcdefghijklmn', s.substr(0, void 0)); // kjs and v8 +assertEquals('', s.substr(0, null)); +assertEquals(s, s.substr(0, String(s.length))); +assertEquals('a', s.substr(0, true)); + + +// Test substrings of different lengths and alignments. +// First ASCII. +var x = "ASCII"; +for (var i = 0; i < 25; i++) { + x += (i >> 4).toString(16) + (i & 0x0f).toString(16); +} +/x/.exec(x); // Try to force a flatten. +for (var i = 5; i < 25; i++) { + for (var j = 12; j < 25; j++) { + var z = x.substring(i, i+j); + var w = Math.random() * 42; // Allocate something new in new-space. + assertEquals(j, z.length); + for (var k = 0; k < j; k++) { + assertEquals(x.charAt(i+k), z.charAt(k)); + } + } +} +// Then two-byte strings. +x = "UC16\u2028"; // Non-ascii char forces two-byte string. +for (var i = 0; i < 25; i++) { + x += (i >> 4).toString(16) + (i & 0x0f).toString(16); +} +/x/.exec(x); // Try to force a flatten. +for (var i = 5; i < 25; i++) { + for (var j = 0; j < 25; j++) { + var z = x.substring(i, i + j); + var w = Math.random() * 42; // Allocate something new in new-space. + assertEquals(j, z.length); + for (var k = 0; k < j; k++) { + assertEquals(x.charAt(i+k), z.charAt(k)); + } + } +} + +// Keep creating strings to force allocation failure on substring creation.
+var x = "0123456789ABCDEF"; +x += x; // 2^5 +x += x; +x += x; +x += x; +x += x; +x += x; // 2^10 +x += x; +x += x; +var xl = x.length; +var cache = []; +for (var i = 0; i < 10000; i++) { + var z = x.substring(i % xl); + assertEquals(xl - (i % xl), z.length); + cache.push(z); +} + + +// Same with two-byte strings +var x = "\u2028123456789ABCDEF"; +x += x; // 2^5 +x += x; +x += x; +x += x; +x += x; +x += x; // 2^10 +x += x; +x += x; +var xl = x.length; +var cache = []; +for (var i = 0; i < 10000; i++) { + var z = x.substring(i % xl); + assertEquals(xl - (i % xl), z.length); + cache.push(z); +} + +// Substring of substring. +var cache = []; +var last = x; +var offset = 0; +for (var i = 0; i < 64; i++) { + var z = last.substring(i); + last = z; + cache.push(z); + offset += i; +} +for (var i = 63; i >= 0; i--) { + var z = cache.pop(); + assertTrue(/\u2028123456789ABCDEF/.test(z)); + assertEquals(xl - offset, z.length); + offset -= i; +} + +// Test charAt for different strings. +function f(s1, s2, s3, i) { + assertEquals(String.fromCharCode(97+i%11), s1.charAt(i%11)); + assertEquals(String.fromCharCode(97+i%11), s2.charAt(i%11)); + assertEquals(String.fromCharCode(98+i%11), s3.charAt(i%11)); + assertEquals(String.fromCharCode(101), s3.charAt(3)); +} + +flat = "abcdefghijkl12345"; +cons = flat + flat.toUpperCase(); +slice = "abcdefghijklmn12345".slice(1, -1); +for ( var i = 0; i < 1000; i++) { + f(flat, cons, slice, i); +} +flat = "abcdefghijkl1\u20232345"; +cons = flat + flat.toUpperCase(); +slice = "abcdefghijklmn1\u20232345".slice(1, -1); +for ( var i = 0; i < 1000; i++) { + f(flat, cons, slice, i); +} + +// Concatenate substrings. 
+var ascii = 'abcdefghijklmnop'; +var utf = '\u03B1\u03B2\u03B3\u03B4\u03B5\u03B6\u03B7\u03B8\u03B9\u03BA\u03BB'; +assertEquals("klmno", ascii.substring(10,15) + ascii.substring(16)); +assertEquals("\u03B4\u03B7", utf.substring(3,4) + utf.substring(6,7)); +assertEquals("klp", ascii.substring(10,12) + ascii.substring(15,16)); +assertEquals("\u03B1\u03B4\u03B5", utf.substring(0,1) + utf.substring(5,3)); +assertEquals("", ascii.substring(16) + utf.substring(16)); +assertEquals("bcdef\u03B4\u03B5\u03B6\u03B7\u03B8\u03B9", + ascii.substring(1,6) + utf.substring(3,9)); +assertEquals("\u03B4\u03B5\u03B6\u03B7\u03B8\u03B9abcdefghijklmnop", + utf.substring(3,9) + ascii); +assertEquals("\u03B2\u03B3\u03B4\u03B5\u03B4\u03B5\u03B6\u03B7", + utf.substring(5,1) + utf.substring(3,7)); + +/* +// Externalizing strings. +var a = "123456789qwertyuiopasdfghjklzxcvbnm"; +var b = a.slice(1,-1); +assertEquals(a.slice(1,-1), b); +externalizeString(a); +assertEquals(a.slice(1,-1), b); +*/ \ No newline at end of file diff --git a/test/mjsunit/substr.js b/test/mjsunit/substr.js index f69a9c0..cffaf94 100755 --- a/test/mjsunit/substr.js +++ b/test/mjsunit/substr.js @@ -135,3 +135,20 @@ for (var i = 0; i < 10000; i++) { assertEquals(xl - (i % xl), z.length); cache.push(z); } + +// Substring of substring. +var cache = []; +var last = x; +var offset = 0; +for (var i = 0; i < 64; i++) { + var z = last.substring(i); + last = z; + cache.push(z); + offset += i; +} +for (var i = 63; i >= 0; i--) { + var z = cache.pop(); + assertTrue(/\u2028123456789ABCDEF/.test(z)); + assertEquals(xl - offset, z.length); + offset -= i; +}