From 40d86c616c5dfd88a7cf850f75d77ff34209b326 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Thu, 24 Jun 2010 09:03:49 +0000 Subject: [PATCH] X64: Remove more fpu code. Unroll more local initialization loops. Review URL: http://codereview.chromium.org/2815028 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4934 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/x64/assembler-x64.cc | 22 +++++++++++++++ src/x64/assembler-x64.h | 3 ++ src/x64/codegen-x64.cc | 2 +- src/x64/ic-x64.cc | 66 +++++++++----------------------------------- src/x64/virtual-frame-x64.cc | 36 ++++++++++++++++++------ src/x64/virtual-frame-x64.h | 2 +- 6 files changed, 68 insertions(+), 63 deletions(-) diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc index 58d4739..2bb92d7 100644 --- a/src/x64/assembler-x64.cc +++ b/src/x64/assembler-x64.cc @@ -2738,6 +2738,28 @@ void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { } +void Assembler::cvtsd2si(Register dst, XMMRegister src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0xF2); + emit_optional_rex_32(dst, src); + emit(0x0F); + emit(0x2D); + emit_sse_operand(dst, src); +} + + +void Assembler::cvtsd2siq(Register dst, XMMRegister src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0xF2); + emit_rex_64(dst, src); + emit(0x0F); + emit(0x2D); + emit_sse_operand(dst, src); +} + + void Assembler::addsd(XMMRegister dst, XMMRegister src) { EnsureSpace ensure_space(this); last_pc_ = pc_; diff --git a/src/x64/assembler-x64.h b/src/x64/assembler-x64.h index 407863f..213db2c 100644 --- a/src/x64/assembler-x64.h +++ b/src/x64/assembler-x64.h @@ -1128,6 +1128,9 @@ class Assembler : public Malloced { void cvtss2sd(XMMRegister dst, const Operand& src); void cvtsd2ss(XMMRegister dst, XMMRegister src); + void cvtsd2si(Register dst, XMMRegister src); + void cvtsd2siq(Register dst, XMMRegister src); + void addsd(XMMRegister dst, XMMRegister src); void subsd(XMMRegister dst, XMMRegister src); void 
mulsd(XMMRegister dst, XMMRegister src); diff --git a/src/x64/codegen-x64.cc b/src/x64/codegen-x64.cc index 572fa62..46e0114 100644 --- a/src/x64/codegen-x64.cc +++ b/src/x64/codegen-x64.cc @@ -2641,7 +2641,7 @@ void CodeGenerator::VisitArrayLiteral(ArrayLiteral* node) { // Generate code to set the elements in the array that are not // literals. - for (int i = 0; i < node->values()->length(); i++) { + for (int i = 0; i < length; i++) { Expression* value = node->values()->at(i); // If value is a literal the property value is already set in the diff --git a/src/x64/ic-x64.cc b/src/x64/ic-x64.cc index e17ff1f..31a806a 100644 --- a/src/x64/ic-x64.cc +++ b/src/x64/ic-x64.cc @@ -791,7 +791,6 @@ void KeyedLoadIC::GenerateExternalArray(MacroAssembler* masm, // Allocate a HeapNumber for the int and perform int-to-double // conversion. - ASSERT(array_type == kExternalUnsignedIntArray); // The value is zero-extended since we loaded the value from memory // with movl. __ cvtqsi2sd(xmm0, rcx); @@ -1121,55 +1120,41 @@ void KeyedStoreIC::GenerateExternalArray(MacroAssembler* masm, // The WebGL specification leaves the behavior of storing NaN and // +/-Infinity into integer arrays basically undefined. For more // reproducible behavior, convert these to zero. - __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset)); + __ movsd(xmm0, FieldOperand(rax, HeapNumber::kValueOffset)); __ movq(rbx, FieldOperand(rbx, ExternalArray::kExternalPointerOffset)); // rdi: untagged index // rbx: base pointer of external storage // top of FPU stack: value if (array_type == kExternalFloatArray) { - __ fstp_s(Operand(rbx, rdi, times_4, 0)); + __ cvtsd2ss(xmm0, xmm0); + __ movss(Operand(rbx, rdi, times_4, 0), xmm0); __ ret(0); } else { // Need to perform float-to-int conversion. - // Test the top of the FP stack for NaN. - Label is_nan; - __ fucomi(0); - __ j(parity_even, &is_nan); - - __ push(rdx); // Make room on the stack. Receiver is no longer needed. 
- // TODO(lrn): If the rounding of this conversion is not deliberate, maybe - // switch to xmm registers. - __ fistp_d(Operand(rsp, 0)); - __ pop(rdx); + // Test the value for NaN. + + // Convert to int32 and store the low byte/word. + // If the value is NaN or +/-infinity, the result is 0x80000000, + // which is automatically zero when taken mod 2^n, n < 32. // rdx: value (converted to an untagged integer) // rdi: untagged index // rbx: base pointer of external storage switch (array_type) { case kExternalByteArray: case kExternalUnsignedByteArray: + __ cvtsd2si(rdx, xmm0); __ movb(Operand(rbx, rdi, times_1, 0), rdx); break; case kExternalShortArray: case kExternalUnsignedShortArray: + __ cvtsd2si(rdx, xmm0); __ movw(Operand(rbx, rdi, times_2, 0), rdx); break; case kExternalIntArray: case kExternalUnsignedIntArray: { - // We also need to explicitly check for +/-Infinity. These are - // converted to MIN_INT, but we need to be careful not to - // confuse with legal uses of MIN_INT. Since MIN_INT truncated - // to 8 or 16 bits is zero, we only perform this test when storing - // 32-bit ints. - Label not_infinity; - // This test would apparently detect both NaN and Infinity, - // but we've already checked for NaN using the FPU hardware - // above. - __ movzxwq(rcx, FieldOperand(rax, HeapNumber::kValueOffset + 6)); - __ and_(rcx, Immediate(0x7FF0)); - __ cmpw(rcx, Immediate(0x7FF0)); - __ j(not_equal, &not_infinity); - __ movq(rdx, Immediate(0)); - __ bind(&not_infinity); + // Convert to int64, so that NaN and infinities become + // 0x8000000000000000, which is zero mod 2^32. 
+ __ cvtsd2siq(rdx, xmm0); __ movl(Operand(rbx, rdi, times_4, 0), rdx); break; } @@ -1178,31 +1163,6 @@ void KeyedStoreIC::GenerateExternalArray(MacroAssembler* masm, break; } __ ret(0); - - __ bind(&is_nan); - // rdi: untagged index - // rbx: base pointer of external storage - __ ffree(); - __ fincstp(); - __ Set(rdx, 0); - switch (array_type) { - case kExternalByteArray: - case kExternalUnsignedByteArray: - __ movb(Operand(rbx, rdi, times_1, 0), rdx); - break; - case kExternalShortArray: - case kExternalUnsignedShortArray: - __ movw(Operand(rbx, rdi, times_2, 0), rdx); - break; - case kExternalIntArray: - case kExternalUnsignedIntArray: - __ movl(Operand(rbx, rdi, times_4, 0), rdx); - break; - default: - UNREACHABLE(); - break; - } - __ ret(0); } // Slow case: call runtime. diff --git a/src/x64/virtual-frame-x64.cc b/src/x64/virtual-frame-x64.cc index e65378d..f5e17fd 100644 --- a/src/x64/virtual-frame-x64.cc +++ b/src/x64/virtual-frame-x64.cc @@ -115,25 +115,45 @@ void VirtualFrame::AllocateStackSlots() { Handle<Object> undefined = Factory::undefined_value(); FrameElement initial_value = FrameElement::ConstantElement(undefined, FrameElement::SYNCED); - if (count == 1) { - __ Push(undefined); - } else if (count < kLocalVarBound) { - // For less locals the unrolled loop is more compact. - __ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT); + if (count < kLocalVarBound) { + // For fewer locals the unrolled loop is more compact. + + // Hope for one of the first eight registers, where the push operation + // takes only one byte (kScratchRegister needs the REX.W bit). + Result tmp = cgen()->allocator()->Allocate(); + ASSERT(tmp.is_valid()); + __ movq(tmp.reg(), undefined, RelocInfo::EMBEDDED_OBJECT); for (int i = 0; i < count; i++) { - __ push(kScratchRegister); + __ push(tmp.reg()); } } else { // For more locals a loop in generated code is more compact. 
Label alloc_locals_loop; Result cnt = cgen()->allocator()->Allocate(); ASSERT(cnt.is_valid()); - __ movq(cnt.reg(), Immediate(count)); __ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT); +#ifdef DEBUG + Label loop_size; + __ bind(&loop_size); +#endif + if (is_uint8(count)) { + // Loading imm8 is shorter than loading imm32. + // Loading only partial byte register, and using decb below. + __ movb(cnt.reg(), Immediate(count)); + } else { + __ movl(cnt.reg(), Immediate(count)); + } __ bind(&alloc_locals_loop); __ push(kScratchRegister); - __ decl(cnt.reg()); + if (is_uint8(count)) { + __ decb(cnt.reg()); + } else { + __ decl(cnt.reg()); + } __ j(not_zero, &alloc_locals_loop); +#ifdef DEBUG + CHECK(masm()->SizeOfCodeGeneratedSince(&loop_size) < kLocalVarBound); +#endif } for (int i = 0; i < count; i++) { elements_.Add(initial_value); diff --git a/src/x64/virtual-frame-x64.h b/src/x64/virtual-frame-x64.h index dc270fe..0549e3c 100644 --- a/src/x64/virtual-frame-x64.h +++ b/src/x64/virtual-frame-x64.h @@ -200,7 +200,7 @@ class VirtualFrame : public ZoneObject { inline void PrepareForReturn(); // Number of local variables after when we use a loop for allocating. - static const int kLocalVarBound = 7; + static const int kLocalVarBound = 14; // Allocate and initialize the frame-allocated locals. void AllocateStackSlots(); -- 2.7.4