From: lrn@chromium.org Date: Mon, 3 May 2010 10:43:49 +0000 (+0000) Subject: X64: Port inline transcendental cache to X64. X-Git-Tag: upstream/4.7.83~21891 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1790c3534a5faa8d2a1adb0eb293081b9c64b24c;p=platform%2Fupstream%2Fv8.git X64: Port inline transcendental cache to X64. Review URL: http://codereview.chromium.org/1860001 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4567 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- diff --git a/src/objects.h b/src/objects.h index 40be3a2..48be902 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1116,6 +1116,8 @@ class HeapNumber: public HeapObject { static const uint32_t kSignMask = 0x80000000u; static const uint32_t kExponentMask = 0x7ff00000u; static const uint32_t kMantissaMask = 0xfffffu; + static const int kMantissaBits = 52; + static const int KExponentBits = 11; static const int kExponentBias = 1023; static const int kExponentShift = 20; static const int kMantissaBitsInTopWord = 20; diff --git a/src/x64/assembler-x64-inl.h b/src/x64/assembler-x64-inl.h index 9c7f9b6..c9ab627 100644 --- a/src/x64/assembler-x64-inl.h +++ b/src/x64/assembler-x64-inl.h @@ -89,6 +89,11 @@ void Assembler::emit_rex_64(XMMRegister reg, Register rm_reg) { } +void Assembler::emit_rex_64(Register reg, XMMRegister rm_reg) { + emit(0x48 | (reg.code() & 0x8) >> 1 | rm_reg.code() >> 3); +} + + void Assembler::emit_rex_64(Register reg, const Operand& op) { emit(0x48 | reg.high_bit() << 2 | op.rex_); } @@ -160,6 +165,12 @@ void Assembler::emit_optional_rex_32(XMMRegister reg, Register base) { } +void Assembler::emit_optional_rex_32(Register reg, XMMRegister base) { + byte rex_bits = (reg.code() & 0x8) >> 1 | (base.code() & 0x8) >> 3; + if (rex_bits != 0) emit(0x40 | rex_bits); +} + + void Assembler::emit_optional_rex_32(Register rm_reg) { if (rm_reg.high_bit()) emit(0x41); } diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc index 23b0b70..1c00ebc 100644 --- a/src/x64/assembler-x64.cc +++ b/src/x64/assembler-x64.cc @@ -2042,6 +2042,14 @@ void Assembler::fldz() { } +void Assembler::fldpi() { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0xD9); + emit(0xEB); +} + + void Assembler::fld_s(const Operand& adr) { EnsureSpace ensure_space(this); last_pc_ = pc_; @@ -2400,6 +2408,53 @@ void Assembler::movd(XMMRegister dst, Register src) { } +void Assembler::movd(Register dst, XMMRegister src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0x66); + emit_optional_rex_32(dst, src); + emit(0x0F); + emit(0x7E); + emit_sse_operand(dst, src); +} + + +void Assembler::movq(XMMRegister dst, Register src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0x66); + emit_rex_64(dst, src); + emit(0x0F); + emit(0x6E); + emit_sse_operand(dst, src); +} + + +void Assembler::movq(Register dst, XMMRegister src) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0x66); + emit_rex_64(dst, src); + emit(0x0F); + emit(0x7E); + emit_sse_operand(dst, src); +} + + +void Assembler::extractps(Register dst, XMMRegister src, byte imm8) { + ASSERT(is_uint2(imm8)); + EnsureSpace ensure_space(this); + last_pc_ = pc_; + emit(0x66); + emit_optional_rex_32(dst, src); + emit(0x0F); + emit(0x3A); + emit(0x17); + emit_sse_operand(dst, src); + emit(imm8); +} + + void Assembler::movsd(const Operand& dst, XMMRegister src) { EnsureSpace ensure_space(this); last_pc_ = pc_; @@ -2601,6 +2656,10 @@ void Assembler::emit_sse_operand(XMMRegister dst, Register src) { emit(0xC0 | (dst.low_bits() << 3) | src.low_bits()); } +void Assembler::emit_sse_operand(Register dst, XMMRegister src) { + emit(0xC0 | (dst.low_bits() << 3) | src.low_bits()); +} + // Relocation information implementations. diff --git a/src/x64/assembler-x64.h b/src/x64/assembler-x64.h index 6bc18f6..d077865 100644 --- a/src/x64/assembler-x64.h +++ b/src/x64/assembler-x64.h @@ -1019,6 +1019,7 @@ class Assembler : public Malloced { void fld1(); void fldz(); + void fldpi(); void fld_s(const Operand& adr); void fld_d(const Operand& adr); @@ -1080,6 +1081,10 @@ class Assembler : public Malloced { // SSE2 instructions void movd(XMMRegister dst, Register src); + void movd(Register dst, XMMRegister src); + void movq(XMMRegister dst, Register src); + void movq(Register dst, XMMRegister src); + void extractps(Register dst, XMMRegister src, byte imm8); void movsd(const Operand& dst, XMMRegister src); void movsd(XMMRegister dst, XMMRegister src); @@ -1110,6 +1115,7 @@ class Assembler : public Malloced { void emit_sse_operand(XMMRegister dst, XMMRegister src); void emit_sse_operand(XMMRegister reg, const Operand& adr); void emit_sse_operand(XMMRegister dst, Register src); + void emit_sse_operand(Register dst, XMMRegister src); // Use either movsd or movlpd. // void movdbl(XMMRegister dst, const Operand& src); @@ -1176,8 +1182,9 @@ class Assembler : public Malloced { // the top bit of both register codes. // High bit of reg goes to REX.R, high bit of rm_reg goes to REX.B. // REX.W is set. - inline void emit_rex_64(Register reg, Register rm_reg); inline void emit_rex_64(XMMRegister reg, Register rm_reg); + inline void emit_rex_64(Register reg, XMMRegister rm_reg); + inline void emit_rex_64(Register reg, Register rm_reg); // Emits a REX prefix that encodes a 64-bit operand size and // the top bit of the destination, index, and base register codes. @@ -1235,9 +1242,13 @@ class Assembler : public Malloced { inline void emit_optional_rex_32(XMMRegister reg, XMMRegister base); // As for emit_optional_rex_32(Register, Register), except that - // the registers are XMM registers. + // one of the registers is an XMM registers. inline void emit_optional_rex_32(XMMRegister reg, Register base); + // As for emit_optional_rex_32(Register, Register), except that + // one of the registers is an XMM registers. + inline void emit_optional_rex_32(Register reg, XMMRegister base); + // As for emit_optional_rex_32(Register, const Operand&), except that // the register is an XMM register. inline void emit_optional_rex_32(XMMRegister reg, const Operand& op); diff --git a/src/x64/codegen-x64.cc b/src/x64/codegen-x64.cc index a6f856e..3b0cb09 100644 --- a/src/x64/codegen-x64.cc +++ b/src/x64/codegen-x64.cc @@ -43,6 +43,8 @@ namespace internal { #define __ ACCESS_MASM(masm_) +static int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000); + // ------------------------------------------------------------------------- // Platform-specific DeferredCode functions. @@ -4516,19 +4518,19 @@ void CodeGenerator::GenerateCallFunction(ZoneList* args) { void CodeGenerator::GenerateMathSin(ZoneList* args) { ASSERT_EQ(args->length(), 1); - // Load the argument on the stack and jump to the runtime. Load(args->at(0)); - Result answer = frame_->CallRuntime(Runtime::kMath_sin, 1); - frame_->Push(&answer); + TranscendentalCacheStub stub(TranscendentalCache::SIN); + Result result = frame_->CallStub(&stub, 1); + frame_->Push(&result); } void CodeGenerator::GenerateMathCos(ZoneList* args) { ASSERT_EQ(args->length(), 1); - // Load the argument on the stack and jump to the runtime. Load(args->at(0)); - Result answer = frame_->CallRuntime(Runtime::kMath_cos, 1); - frame_->Push(&answer); + TranscendentalCacheStub stub(TranscendentalCache::COS); + Result result = frame_->CallStub(&stub, 1); + frame_->Push(&result); } @@ -7459,6 +7461,216 @@ bool CodeGenerator::FoldConstantSmis(Token::Value op, int left, int right) { // End of CodeGenerator implementation. +void TranscendentalCacheStub::Generate(MacroAssembler* masm) { + // Input on stack: + // rsp[8]: argument (should be number). + // rsp[0]: return address. + Label runtime_call; + Label runtime_call_clear_stack; + Label input_not_smi; + Label loaded; + // Test that rax is a number. + __ movq(rax, Operand(rsp, kPointerSize)); + __ JumpIfNotSmi(rax, &input_not_smi); + // Input is a smi. Untag and load it onto the FPU stack. + // Then load the bits of the double into rbx. + __ SmiToInteger32(rax, rax); + __ subq(rsp, Immediate(kPointerSize)); + __ cvtlsi2sd(xmm1, rax); + __ movsd(Operand(rsp, 0), xmm1); + __ movq(rbx, xmm1); + __ movq(rdx, xmm1); + __ fld_d(Operand(rsp, 0)); + __ addq(rsp, Immediate(kPointerSize)); + __ jmp(&loaded); + + __ bind(&input_not_smi); + // Check if input is a HeapNumber. + __ Move(rbx, Factory::heap_number_map()); + __ cmpq(rbx, FieldOperand(rax, HeapObject::kMapOffset)); + __ j(not_equal, &runtime_call); + // Input is a HeapNumber. Push it on the FPU stack and load its + // bits into rbx. + __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset)); + __ movq(rbx, FieldOperand(rax, HeapNumber::kValueOffset)); + __ movq(rdx, rbx); + __ bind(&loaded); + // ST[0] == double value + // rbx = bits of double value. + // rdx = also bits of double value. + // Compute hash (h is 32 bits, bits are 64): + // h = h0 = bits ^ (bits >> 32); + // h ^= h >> 16; + // h ^= h >> 8; + // h = h & (cacheSize - 1); + // or h = (h0 ^ (h0 >> 8) ^ (h0 >> 16) ^ (h0 >> 24)) & (cacheSize - 1) + __ sar(rdx, Immediate(32)); + __ xorl(rdx, rbx); + __ movl(rcx, rdx); + __ movl(rax, rdx); + __ movl(rdi, rdx); + __ sarl(rdx, Immediate(8)); + __ sarl(rcx, Immediate(16)); + __ sarl(rax, Immediate(24)); + __ xorl(rcx, rdx); + __ xorl(rax, rdi); + __ xorl(rcx, rax); + ASSERT(IsPowerOf2(TranscendentalCache::kCacheSize)); + __ andl(rcx, Immediate(TranscendentalCache::kCacheSize - 1)); + // ST[0] == double value. + // rbx = bits of double value. + // rcx = TranscendentalCache::hash(double value). + __ movq(rax, ExternalReference::transcendental_cache_array_address()); + // rax points to cache array. + __ movq(rax, Operand(rax, type_ * sizeof(TranscendentalCache::caches_[0]))); + // rax points to the cache for the type type_. + // If NULL, the cache hasn't been initialized yet, so go through runtime. + __ testq(rax, rax); + __ j(zero, &runtime_call_clear_stack); +#ifdef DEBUG + // Check that the layout of cache elements match expectations. + { // NOLINT - doesn't like a single brace on a line. + TranscendentalCache::Element test_elem[2]; + char* elem_start = reinterpret_cast(&test_elem[0]); + char* elem2_start = reinterpret_cast(&test_elem[1]); + char* elem_in0 = reinterpret_cast(&(test_elem[0].in[0])); + char* elem_in1 = reinterpret_cast(&(test_elem[0].in[1])); + char* elem_out = reinterpret_cast(&(test_elem[0].output)); + // Two uint_32's and a pointer per element. + CHECK_EQ(16, static_cast(elem2_start - elem_start)); + CHECK_EQ(0, static_cast(elem_in0 - elem_start)); + CHECK_EQ(kIntSize, static_cast(elem_in1 - elem_start)); + CHECK_EQ(2 * kIntSize, static_cast(elem_out - elem_start)); + } +#endif + // Find the address of the rcx'th entry in the cache, i.e., &rax[rcx*16]. + __ addl(rcx, rcx); + __ lea(rcx, Operand(rax, rcx, times_8, 0)); + // Check if cache matches: Double value is stored in uint32_t[2] array. + Label cache_miss; + __ cmpq(rbx, Operand(rcx, 0)); + __ j(not_equal, &cache_miss); + // Cache hit! + __ movq(rax, Operand(rcx, 2 * kIntSize)); + __ fstp(0); // Clear FPU stack. + __ ret(kPointerSize); + + __ bind(&cache_miss); + // Update cache with new value. + __ AllocateHeapNumber(rax, rdi, &runtime_call_clear_stack); + GenerateOperation(masm); + __ movq(Operand(rcx, 0), rbx); + __ movq(Operand(rcx, 2 * kIntSize), rax); + __ fstp_d(FieldOperand(rax, HeapNumber::kValueOffset)); + __ ret(kPointerSize); + + __ bind(&runtime_call_clear_stack); + __ fstp(0); + __ bind(&runtime_call); + __ TailCallExternalReference(ExternalReference(RuntimeFunction()), 1, 1); +} + + +Runtime::FunctionId TranscendentalCacheStub::RuntimeFunction() { + switch (type_) { + // Add more cases when necessary. + case TranscendentalCache::SIN: return Runtime::kMath_sin; + case TranscendentalCache::COS: return Runtime::kMath_cos; + default: + UNIMPLEMENTED(); + return Runtime::kAbort; + } +} + + +void TranscendentalCacheStub::GenerateOperation(MacroAssembler* masm) { + // Only free register is rdi. + Label done; + ASSERT(type_ == TranscendentalCache::SIN || + type_ == TranscendentalCache::COS); + // More transcendental types can be added later. + + // Both fsin and fcos require arguments in the range +/-2^63 and + // return NaN for infinities and NaN. They can share all code except + // the actual fsin/fcos operation. + Label in_range; + // If argument is outside the range -2^63..2^63, fsin/cos doesn't + // work. We must reduce it to the appropriate range. + __ movq(rdi, rbx); + // Move exponent and sign bits to low bits. + __ shr(rdi, Immediate(HeapNumber::kMantissaBits)); + // Remove sign bit. + __ andl(rdi, Immediate((1 << HeapNumber::KExponentBits) - 1)); + int supported_exponent_limit = (63 + HeapNumber::kExponentBias); + __ cmpl(rdi, Immediate(supported_exponent_limit)); + __ j(below, &in_range); + // Check for infinity and NaN. Both return NaN for sin. + __ cmpl(rdi, Immediate(0x7ff)); + Label non_nan_result; + __ j(not_equal, &non_nan_result); + // Input is +/-Infinity or NaN. Result is NaN. + __ fstp(0); // Clear fpu stack. + // NaN is represented by 0x7ff8000000000000. + __ movq(rdi, kNaNValue, RelocInfo::NONE); + __ push(rdi); + __ fld_d(Operand(rsp, 0)); + __ addq(rsp, Immediate(kPointerSize)); + __ jmp(&done); + + __ bind(&non_nan_result); + + // Use fpmod to restrict argument to the range +/-2*PI. + __ movq(rdi, rax); // Save rax before using fnstsw_ax. + __ fldpi(); + __ fadd(0); + __ fld(1); + // FPU Stack: input, 2*pi, input. + { + Label no_exceptions; + __ fwait(); + __ fnstsw_ax(); + // Clear if Illegal Operand or Zero Division exceptions are set. + __ testl(rax, Immediate(5)); + __ j(zero, &no_exceptions); + __ fnclex(); + __ bind(&no_exceptions); + } + + // Compute st(0) % st(1) + { + Label partial_remainder_loop; + __ bind(&partial_remainder_loop); + __ fprem1(); + __ fwait(); + __ fnstsw_ax(); + __ testl(rax, Immediate(0x400)); // Check C2 bit of FPU status word. + // If C2 is set, computation only has partial result. Loop to + // continue computation. + __ j(not_zero, &partial_remainder_loop); + } + // FPU Stack: input, 2*pi, input % 2*pi + __ fstp(2); + // FPU Stack: input % 2*pi, 2*pi, + __ fstp(0); + // FPU Stack: input % 2*pi + __ movq(rax, rdi); // Restore rax (allocated HeapNumber pointer). + + // FPU Stack: input % 2*pi + __ bind(&in_range); + switch (type_) { + case TranscendentalCache::SIN: + __ fsin(); + break; + case TranscendentalCache::COS: + __ fcos(); + break; + default: + UNREACHABLE(); + } + __ bind(&done); +} + + // Get the integer part of a heap number. Surprisingly, all this bit twiddling // is faster than using the built-in instructions on floating point registers. // Trashes rdi and rbx. Dest is rcx. Source cannot be rcx or one of the @@ -10995,7 +11207,6 @@ ModuloFunction CreateModuloFunction() { __ testb(rax, Immediate(5)); __ j(zero, &valid_result); __ fstp(0); // Drop result in st(0). - int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000); __ movq(rcx, kNaNValue, RelocInfo::NONE); __ movq(Operand(rsp, kPointerSize), rcx); __ movsd(xmm0, Operand(rsp, kPointerSize)); diff --git a/src/x64/codegen-x64.h b/src/x64/codegen-x64.h index 8416f39..9279d30 100644 --- a/src/x64/codegen-x64.h +++ b/src/x64/codegen-x64.h @@ -675,6 +675,22 @@ class CodeGenerator: public AstVisitor { }; +// Compute a transcendental math function natively, or call the +// TranscendentalCache runtime function. +class TranscendentalCacheStub: public CodeStub { + public: + explicit TranscendentalCacheStub(TranscendentalCache::Type type) + : type_(type) {} + void Generate(MacroAssembler* masm); + private: + TranscendentalCache::Type type_; + Major MajorKey() { return TranscendentalCache; } + int MinorKey() { return type_; } + Runtime::FunctionId RuntimeFunction(); + void GenerateOperation(MacroAssembler* masm); +}; + + // Flag that indicates how to generate code for the stub GenericBinaryOpStub. enum GenericBinaryFlags { NO_GENERIC_BINARY_FLAGS = 0, diff --git a/src/x64/disasm-x64.cc b/src/x64/disasm-x64.cc index 4258ab6..bd912cd 100644 --- a/src/x64/disasm-x64.cc +++ b/src/x64/disasm-x64.cc @@ -996,23 +996,44 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) { if (operand_size_ == 0x66) { // 0x66 0x0F prefix. int mod, regop, rm; - get_modrm(*current, &mod, ®op, &rm); - if (opcode == 0x6E) { - AppendToBuffer("movd %s,", NameOfXMMRegister(regop)); - current += PrintRightOperand(current); - } else { - const char* mnemonic = "?"; - if (opcode == 0x57) { - mnemonic = "xorpd"; - } else if (opcode == 0x2E) { - mnemonic = "comisd"; - } else if (opcode == 0x2F) { - mnemonic = "ucomisd"; + if (opcode == 0x3A) { + byte third_byte = *current; + current = data + 3; + if (third_byte == 0x17) { + get_modrm(*current, &mod, ®op, &rm); + AppendToBuffer("extractps "); // reg/m32, xmm, imm8 + current += PrintRightOperand(current); + AppendToBuffer(", %s, %d", NameOfCPURegister(regop), (*current) & 3); + current += 1; } else { UnimplementedInstruction(); } - AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop)); - current += PrintRightXMMOperand(current); + } else { + get_modrm(*current, &mod, ®op, &rm); + if (opcode == 0x6E) { + AppendToBuffer("mov%c %s,", + rex_w() ? 'q' : 'd', + NameOfXMMRegister(regop)); + current += PrintRightOperand(current); + } else if (opcode == 0x7E) { + AppendToBuffer("mov%c %s,", + rex_w() ? 'q' : 'd', + NameOfCPURegister(regop)); + current += PrintRightXMMOperand(current); + } else { + const char* mnemonic = "?"; + if (opcode == 0x57) { + mnemonic = "xorpd"; + } else if (opcode == 0x2E) { + mnemonic = "comisd"; + } else if (opcode == 0x2F) { + mnemonic = "ucomisd"; + } else { + UnimplementedInstruction(); + } + AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop)); + current += PrintRightXMMOperand(current); + } } } else if (group_1_prefix_ == 0xF2) { // Beginning of instructions with prefix 0xF2.