From 4f3d27e64fd9e7d2996de1e9ec742f1c2b56cdbf Mon Sep 17 00:00:00 2001 From: "weiliang.lin" Date: Tue, 2 Dec 2014 07:30:03 -0800 Subject: [PATCH] [ia32] Introduce FMA3 instructions on scalar data elements. port 83a635e0d70f144300ea83be0d7effc1eb1bf6ef BUG= Review URL: https://codereview.chromium.org/773783002 Cr-Commit-Position: refs/heads/master@{#25619} --- src/ia32/assembler-ia32.cc | 65 ++++++ src/ia32/assembler-ia32.h | 159 +++++++++++++ src/ia32/disasm-ia32.cc | 101 ++++++++- test/cctest/test-assembler-ia32.cc | 454 +++++++++++++++++++++++++++++++++++++ test/cctest/test-disasm-ia32.cc | 75 +++++- 5 files changed, 851 insertions(+), 3 deletions(-) diff --git a/src/ia32/assembler-ia32.cc b/src/ia32/assembler-ia32.cc index e34c02b..2805fa0 100644 --- a/src/ia32/assembler-ia32.cc +++ b/src/ia32/assembler-ia32.cc @@ -2443,6 +2443,71 @@ void Assembler::pinsrd(XMMRegister dst, const Operand& src, int8_t offset) { } +void Assembler::addss(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + EMIT(0xF3); + EMIT(0x0F); + EMIT(0x58); + emit_sse_operand(dst, src); +} + + +void Assembler::subss(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + EMIT(0xF3); + EMIT(0x0F); + EMIT(0x5C); + emit_sse_operand(dst, src); +} + + +void Assembler::mulss(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + EMIT(0xF3); + EMIT(0x0F); + EMIT(0x59); + emit_sse_operand(dst, src); +} + + +void Assembler::divss(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + EMIT(0xF3); + EMIT(0x0F); + EMIT(0x5E); + emit_sse_operand(dst, src); +} + + +void Assembler::ucomiss(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + EMIT(0x0f); + EMIT(0x2e); + emit_sse_operand(dst, src); +} + + +// AVX instructions +void Assembler::vfmasd(byte op, XMMRegister dst, XMMRegister src1, + const Operand& src2) { + DCHECK(IsEnabled(FMA3)); + EnsureSpace ensure_space(this); + emit_vex_prefix(src1, kLIG, k66, k0F38, kW1); + EMIT(op); + emit_sse_operand(dst, src2); +} + + +void Assembler::vfmass(byte op, XMMRegister dst, XMMRegister src1, + const Operand& src2) { + DCHECK(IsEnabled(FMA3)); + EnsureSpace ensure_space(this); + emit_vex_prefix(src1, kLIG, k66, k0F38, kW0); + EMIT(op); + emit_sse_operand(dst, src2); +} + + void Assembler::vsd(byte op, XMMRegister dst, XMMRegister src1, const Operand& src2) { DCHECK(IsEnabled(AVX)); diff --git a/src/ia32/assembler-ia32.h b/src/ia32/assembler-ia32.h index 11a0b4f..b913f7a 100644 --- a/src/ia32/assembler-ia32.h +++ b/src/ia32/assembler-ia32.h @@ -928,6 +928,17 @@ class Assembler : public AssemblerBase { void cpuid(); // SSE instructions + void addss(XMMRegister dst, XMMRegister src) { addss(dst, Operand(src)); } + void addss(XMMRegister dst, const Operand& src); + void subss(XMMRegister dst, XMMRegister src) { subss(dst, Operand(src)); } + void subss(XMMRegister dst, const Operand& src); + void mulss(XMMRegister dst, XMMRegister src) { mulss(dst, Operand(src)); } + void mulss(XMMRegister dst, const Operand& src); + void divss(XMMRegister dst, XMMRegister src) { divss(dst, Operand(src)); } + void divss(XMMRegister dst, const Operand& src); + + void ucomiss(XMMRegister dst, XMMRegister src) { ucomiss(dst, Operand(src)); } + void ucomiss(XMMRegister dst, const Operand& src); void movaps(XMMRegister dst, XMMRegister src); void shufps(XMMRegister dst, XMMRegister src, byte imm8); @@ -1053,6 +1064,154 @@ class Assembler : public AssemblerBase { void movntdq(const Operand& dst, XMMRegister src); 
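A note on the operand orders used by the vfma* declarations that follow: with the assembler signature op(dst, src1, src2), a reading of the Intel SDM consistent with the cctest expectations added later in this patch gives the reference model sketched below. The helper names are illustrative only and are not part of the patch; the model also rounds twice (multiply, then add), whereas a real FMA rounds once, a difference the tests' chosen inputs do not appear to exercise since the expected values are built with separate mulsd/addsd.

// Hypothetical reference model for the scalar-double forms (the ss forms are
// analogous with float). 132/213/231 name which operands are multiplied and
// which one is added, counting dst as operand 1:
//   vfmadd132sd(dst, src1, src2): dst = dst  * src2 + src1
//   vfmadd213sd(dst, src1, src2): dst = src1 * dst  + src2
//   vfmadd231sd(dst, src1, src2): dst = src1 * src2 + dst
// vfmsub* subtracts the addend; vfnmadd*/vfnmsub* negate the product.
static inline double RefFmadd132(double dst, double src1, double src2) {
  return dst * src2 + src1;
}
static inline double RefFmadd213(double dst, double src1, double src2) {
  return src1 * dst + src2;
}
static inline double RefFmadd231(double dst, double src1, double src2) {
  return src1 * src2 + dst;
}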
// AVX instructions + void vfmadd132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd132sd(dst, src1, Operand(src2)); + } + void vfmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd213sd(dst, src1, Operand(src2)); + } + void vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd231sd(dst, src1, Operand(src2)); + } + void vfmadd132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0x99, dst, src1, src2); + } + void vfmadd213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xa9, dst, src1, src2); + } + void vfmadd231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xb9, dst, src1, src2); + } + void vfmsub132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub132sd(dst, src1, Operand(src2)); + } + void vfmsub213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub213sd(dst, src1, Operand(src2)); + } + void vfmsub231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub231sd(dst, src1, Operand(src2)); + } + void vfmsub132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0x9b, dst, src1, src2); + } + void vfmsub213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xab, dst, src1, src2); + } + void vfmsub231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xbb, dst, src1, src2); + } + void vfnmadd132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd132sd(dst, src1, Operand(src2)); + } + void vfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd213sd(dst, src1, Operand(src2)); + } + void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd231sd(dst, src1, Operand(src2)); + } + void vfnmadd132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0x9d, dst, src1, src2); + } + void vfnmadd213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xad, dst, src1, src2); + } + void vfnmadd231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xbd, dst, src1, src2); + } + void vfnmsub132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub132sd(dst, src1, Operand(src2)); + } + void vfnmsub213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub213sd(dst, src1, Operand(src2)); + } + void vfnmsub231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub231sd(dst, src1, Operand(src2)); + } + void vfnmsub132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0x9f, dst, src1, src2); + } + void vfnmsub213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xaf, dst, src1, src2); + } + void vfnmsub231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmasd(0xbf, dst, src1, src2); + } + void vfmasd(byte op, XMMRegister dst, XMMRegister src1, const Operand& src2); + + void vfmadd132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd132ss(dst, src1, Operand(src2)); + } + void vfmadd213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd213ss(dst, src1, Operand(src2)); + } + void vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmadd231ss(dst, src1, Operand(src2)); + } + void vfmadd132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0x99, dst, src1, src2); + } + void vfmadd213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xa9, dst, src1, src2); + } + void vfmadd231ss(XMMRegister dst, XMMRegister src1, 
const Operand& src2) { + vfmass(0xb9, dst, src1, src2); + } + void vfmsub132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub132ss(dst, src1, Operand(src2)); + } + void vfmsub213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub213ss(dst, src1, Operand(src2)); + } + void vfmsub231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfmsub231ss(dst, src1, Operand(src2)); + } + void vfmsub132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0x9b, dst, src1, src2); + } + void vfmsub213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xab, dst, src1, src2); + } + void vfmsub231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xbb, dst, src1, src2); + } + void vfnmadd132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd132ss(dst, src1, Operand(src2)); + } + void vfnmadd213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd213ss(dst, src1, Operand(src2)); + } + void vfnmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmadd231ss(dst, src1, Operand(src2)); + } + void vfnmadd132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0x9d, dst, src1, src2); + } + void vfnmadd213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xad, dst, src1, src2); + } + void vfnmadd231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xbd, dst, src1, src2); + } + void vfnmsub132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub132ss(dst, src1, Operand(src2)); + } + void vfnmsub213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub213ss(dst, src1, Operand(src2)); + } + void vfnmsub231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + vfnmsub231ss(dst, src1, Operand(src2)); + } + void vfnmsub132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0x9f, dst, src1, src2); + } + void vfnmsub213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xaf, dst, src1, src2); + } + void vfnmsub231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) { + vfmass(0xbf, dst, src1, src2); + } + void vfmass(byte op, XMMRegister dst, XMMRegister src1, const Operand& src2); + void vaddsd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { vaddsd(dst, src1, Operand(src2)); } diff --git a/src/ia32/disasm-ia32.cc b/src/ia32/disasm-ia32.cc index 57d1cc9..bf88f69 100644 --- a/src/ia32/disasm-ia32.cc +++ b/src/ia32/disasm-ia32.cc @@ -319,7 +319,7 @@ class DisassemblerIA32 { bool vex_w() { if (vex_byte0_ == 0xc5) return false; - return (vex_byte2_ & 0x80) == 1; + return (vex_byte2_ & 0x80) != 0; } bool vex_0f() { @@ -740,7 +740,74 @@ int DisassemblerIA32::CMov(byte* data) { int DisassemblerIA32::AVXInstruction(byte* data) { byte opcode = *data; byte* current = data + 1; - if (vex_f2() && vex_0f()) { + if (vex_66() && vex_0f38()) { + int mod, regop, rm, vvvv = vex_vreg(); + get_modrm(*current, &mod, ®op, &rm); + switch (opcode) { + case 0x99: + AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xa9: + AppendToBuffer("vfmadd213s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xb9: + AppendToBuffer("vfmadd231s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + 
break; + case 0x9b: + AppendToBuffer("vfmsub132s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xab: + AppendToBuffer("vfmsub213s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xbb: + AppendToBuffer("vfmsub231s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0x9d: + AppendToBuffer("vfnmadd132s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xad: + AppendToBuffer("vfnmadd213s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xbd: + AppendToBuffer("vfnmadd231s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0x9f: + AppendToBuffer("vfnmsub132s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xaf: + AppendToBuffer("vfnmsub213s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + case 0xbf: + AppendToBuffer("vfnmsub231s%c %s,%s,", float_size_code(), + NameOfXMMRegister(regop), NameOfXMMRegister(vvvv)); + current += PrintRightXMMOperand(current); + break; + default: + UnimplementedInstruction(); + } + } else if (vex_f2() && vex_0f()) { int mod, regop, rm, vvvv = vex_vreg(); get_modrm(*current, &mod, ®op, &rm); switch (opcode) { @@ -1159,6 +1226,12 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector out_buffer, NameOfXMMRegister(regop), NameOfXMMRegister(rm)); data++; + } else if (f0byte == 0x2e) { + data += 2; + int mod, regop, rm; + get_modrm(*data, &mod, ®op, &rm); + AppendToBuffer("ucomiss %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); } else if (f0byte >= 0x53 && f0byte <= 0x5F) { const char* const pseudo_op[] = { "rcpps", @@ -1729,12 +1802,36 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector out_buffer, get_modrm(*data, &mod, ®op, &rm); AppendToBuffer("cvttss2si %s,", NameOfCPURegister(regop)); data += PrintRightXMMOperand(data); + } else if (b2 == 0x58) { + data += 3; + int mod, regop, rm; + get_modrm(*data, &mod, ®op, &rm); + AppendToBuffer("addss %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); + } else if (b2 == 0x59) { + data += 3; + int mod, regop, rm; + get_modrm(*data, &mod, ®op, &rm); + AppendToBuffer("mulss %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); } else if (b2 == 0x5A) { data += 3; int mod, regop, rm; get_modrm(*data, &mod, ®op, &rm); AppendToBuffer("cvtss2sd %s,", NameOfXMMRegister(regop)); data += PrintRightXMMOperand(data); + } else if (b2 == 0x5c) { + data += 3; + int mod, regop, rm; + get_modrm(*data, &mod, ®op, &rm); + AppendToBuffer("subss %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); + } else if (b2 == 0x5e) { + data += 3; + int mod, regop, rm; + get_modrm(*data, &mod, ®op, &rm); + AppendToBuffer("divss %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); } else if (b2 == 0x6F) { data += 3; int mod, regop, rm; diff --git a/test/cctest/test-assembler-ia32.cc 
b/test/cctest/test-assembler-ia32.cc index d943297..f59c3c4 100644 --- a/test/cctest/test-assembler-ia32.cc +++ b/test/cctest/test-assembler-ia32.cc @@ -589,4 +589,458 @@ TEST(AssemblerIa32SSE) { } +typedef int (*F9)(double x, double y, double z); +TEST(AssemblerX64FMA_sd) { + CcTest::InitializeVM(); + if (!CpuFeatures::IsSupported(FMA3)) return; + + Isolate* isolate = reinterpret_cast(CcTest::isolate()); + HandleScope scope(isolate); + v8::internal::byte buffer[1024]; + MacroAssembler assm(isolate, buffer, sizeof buffer); + { + CpuFeatureScope fscope(&assm, FMA3); + Label exit; + __ movsd(xmm0, Operand(esp, 1 * kPointerSize)); + __ movsd(xmm1, Operand(esp, 3 * kPointerSize)); + __ movsd(xmm2, Operand(esp, 5 * kPointerSize)); + // argument in xmm0, xmm1 and xmm2 + // xmm0 * xmm1 + xmm2 + __ movaps(xmm3, xmm0); + __ mulsd(xmm3, xmm1); + __ addsd(xmm3, xmm2); // Expected result in xmm3 + + __ sub(esp, Immediate(kDoubleSize)); // For memory operand + // vfmadd132sd + __ mov(eax, Immediate(1)); // Test number + __ movaps(xmm4, xmm0); + __ vfmadd132sd(xmm4, xmm2, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfmadd213sd(xmm4, xmm0, xmm2); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfmadd231sd(xmm4, xmm0, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfmadd132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movsd(Operand(esp, 0), xmm1); + __ vfmadd132sd(xmm4, xmm2, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movsd(Operand(esp, 0), xmm2); + __ vfmadd213sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movsd(Operand(esp, 0), xmm1); + __ vfmadd231sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + // xmm0 * xmm1 - xmm2 + __ movaps(xmm3, xmm0); + __ mulsd(xmm3, xmm1); + __ subsd(xmm3, xmm2); // Expected result in xmm3 + + // vfmsub132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfmsub132sd(xmm4, xmm2, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfmsub213sd(xmm4, xmm0, xmm2); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfmsub231sd(xmm4, xmm0, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfmsub132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movsd(Operand(esp, 0), xmm1); + __ vfmsub132sd(xmm4, xmm2, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movsd(Operand(esp, 0), xmm2); + __ vfmsub213sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movsd(Operand(esp, 0), xmm1); + __ vfmsub231sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + + // - xmm0 * xmm1 + xmm2 + __ movaps(xmm3, xmm0); + __ mulsd(xmm3, xmm1); + __ Move(xmm4, (uint64_t)1 << 63); + __ xorpd(xmm3, xmm4); + __ addsd(xmm3, xmm2); // Expected result in xmm3 + + // vfnmadd132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfnmadd132sd(xmm4, xmm2, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ 
vfnmadd213sd(xmm4, xmm0, xmm2); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfnmadd231sd(xmm4, xmm0, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfnmadd132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movsd(Operand(esp, 0), xmm1); + __ vfnmadd132sd(xmm4, xmm2, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movsd(Operand(esp, 0), xmm2); + __ vfnmadd213sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movsd(Operand(esp, 0), xmm1); + __ vfnmadd231sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + + // - xmm0 * xmm1 - xmm2 + __ movaps(xmm3, xmm0); + __ mulsd(xmm3, xmm1); + __ Move(xmm4, (uint64_t)1 << 63); + __ xorpd(xmm3, xmm4); + __ subsd(xmm3, xmm2); // Expected result in xmm3 + + // vfnmsub132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfnmsub132sd(xmm4, xmm2, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfnmsub213sd(xmm4, xmm0, xmm2); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmsub231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfnmsub231sd(xmm4, xmm0, xmm1); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfnmsub132sd + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movsd(Operand(esp, 0), xmm1); + __ vfnmsub132sd(xmm4, xmm2, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmsub213sd + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movsd(Operand(esp, 0), xmm2); + __ vfnmsub213sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmsub231sd + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movsd(Operand(esp, 0), xmm1); + __ vfnmsub231sd(xmm4, xmm0, Operand(esp, 0)); + __ ucomisd(xmm4, xmm3); + __ j(not_equal, &exit); + + + __ xor_(eax, eax); + __ bind(&exit); + __ add(esp, Immediate(kDoubleSize)); + __ ret(0); + } + + CodeDesc desc; + assm.GetCode(&desc); + Handle code = isolate->factory()->NewCode( + desc, Code::ComputeFlags(Code::STUB), Handle()); +#ifdef OBJECT_PRINT + OFStream os(stdout); + code->Print(os); +#endif + + F9 f = FUNCTION_CAST(code->entry()); + CHECK_EQ(0, f(0.000092662107262076, -2.460774966188315, -1.0958787393627414)); +} + + +typedef int (*F10)(float x, float y, float z); +TEST(AssemblerX64FMA_ss) { + CcTest::InitializeVM(); + if (!CpuFeatures::IsSupported(FMA3)) return; + + Isolate* isolate = reinterpret_cast(CcTest::isolate()); + HandleScope scope(isolate); + v8::internal::byte buffer[1024]; + MacroAssembler assm(isolate, buffer, sizeof buffer); + { + CpuFeatureScope fscope(&assm, FMA3); + Label exit; + __ movss(xmm0, Operand(esp, 1 * kPointerSize)); + __ movss(xmm1, Operand(esp, 2 * kPointerSize)); + __ movss(xmm2, Operand(esp, 3 * kPointerSize)); + // arguments in xmm0, xmm1 and xmm2 + // xmm0 * xmm1 + xmm2 + __ movaps(xmm3, xmm0); + __ mulss(xmm3, xmm1); + __ addss(xmm3, xmm2); // Expected result in xmm3 + + __ sub(esp, Immediate(kDoubleSize)); // For memory operand + // vfmadd132ss + __ mov(eax, Immediate(1)); // Test number + __ movaps(xmm4, xmm0); + __ vfmadd132ss(xmm4, xmm2, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfmadd213ss(xmm4, xmm0, xmm2); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // 
vfmadd231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfmadd231ss(xmm4, xmm0, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfmadd132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movss(Operand(esp, 0), xmm1); + __ vfmadd132ss(xmm4, xmm2, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movss(Operand(esp, 0), xmm2); + __ vfmadd213ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movss(Operand(esp, 0), xmm1); + __ vfmadd231ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + // xmm0 * xmm1 - xmm2 + __ movaps(xmm3, xmm0); + __ mulss(xmm3, xmm1); + __ subss(xmm3, xmm2); // Expected result in xmm3 + + // vfmsub132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfmsub132ss(xmm4, xmm2, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfmsub213ss(xmm4, xmm0, xmm2); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfmsub231ss(xmm4, xmm0, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfmsub132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movss(Operand(esp, 0), xmm1); + __ vfmsub132ss(xmm4, xmm2, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movss(Operand(esp, 0), xmm2); + __ vfmsub213ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movss(Operand(esp, 0), xmm1); + __ vfmsub231ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + + // - xmm0 * xmm1 + xmm2 + __ movaps(xmm3, xmm0); + __ mulss(xmm3, xmm1); + __ Move(xmm4, (uint32_t)1 << 31); + __ xorps(xmm3, xmm4); + __ addss(xmm3, xmm2); // Expected result in xmm3 + + // vfnmadd132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfnmadd132ss(xmm4, xmm2, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmadd213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfnmadd213ss(xmm4, xmm0, xmm2); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfnmadd231ss(xmm4, xmm0, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfnmadd132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movss(Operand(esp, 0), xmm1); + __ vfnmadd132ss(xmm4, xmm2, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movss(Operand(esp, 0), xmm2); + __ vfnmadd213ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmadd231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movss(Operand(esp, 0), xmm1); + __ vfnmadd231ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + + // - xmm0 * xmm1 - xmm2 + __ movaps(xmm3, xmm0); + __ mulss(xmm3, xmm1); + __ Move(xmm4, (uint32_t)1 << 31); + __ xorps(xmm3, xmm4); + __ subss(xmm3, xmm2); // Expected result in xmm3 + + // vfnmsub132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ vfnmsub132ss(xmm4, xmm2, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfmsub213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ vfnmsub213ss(xmm4, xmm0, xmm2); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + 
// vfnmsub231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ vfnmsub231ss(xmm4, xmm0, xmm1); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + // vfnmsub132ss + __ inc(eax); + __ movaps(xmm4, xmm0); + __ movss(Operand(esp, 0), xmm1); + __ vfnmsub132ss(xmm4, xmm2, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmsub213ss + __ inc(eax); + __ movaps(xmm4, xmm1); + __ movss(Operand(esp, 0), xmm2); + __ vfnmsub213ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + // vfnmsub231ss + __ inc(eax); + __ movaps(xmm4, xmm2); + __ movss(Operand(esp, 0), xmm1); + __ vfnmsub231ss(xmm4, xmm0, Operand(esp, 0)); + __ ucomiss(xmm4, xmm3); + __ j(not_equal, &exit); + + + __ xor_(eax, eax); + __ bind(&exit); + __ add(esp, Immediate(kDoubleSize)); + __ ret(0); + } + + CodeDesc desc; + assm.GetCode(&desc); + Handle code = isolate->factory()->NewCode( + desc, Code::ComputeFlags(Code::STUB), Handle()); +#ifdef OBJECT_PRINT + OFStream os(stdout); + code->Print(os); +#endif + + F10 f = FUNCTION_CAST(code->entry()); + CHECK_EQ(0, f(9.26621069e-05f, -2.4607749f, -1.09587872f)); +} #undef __ diff --git a/test/cctest/test-disasm-ia32.cc b/test/cctest/test-disasm-ia32.cc index fb0c562..a2eaa15 100644 --- a/test/cctest/test-disasm-ia32.cc +++ b/test/cctest/test-disasm-ia32.cc @@ -51,7 +51,7 @@ TEST(DisasmIa320) { CcTest::InitializeVM(); Isolate* isolate = CcTest::i_isolate(); HandleScope scope(isolate); - v8::internal::byte buffer[2048]; + v8::internal::byte buffer[4096]; Assembler assm(isolate, buffer, sizeof buffer); DummyStaticFunction(NULL); // just bloody use it (DELETE; debugging) @@ -401,6 +401,14 @@ TEST(DisasmIa320) { __ xorps(xmm0, Operand(ebx, ecx, times_4, 10000)); // Arithmetic operation + __ addss(xmm1, xmm0); + __ addss(xmm1, Operand(ebx, ecx, times_4, 10000)); + __ mulss(xmm1, xmm0); + __ mulss(xmm1, Operand(ebx, ecx, times_4, 10000)); + __ subss(xmm1, xmm0); + __ subss(xmm1, Operand(ebx, ecx, times_4, 10000)); + __ divss(xmm1, xmm0); + __ divss(xmm1, Operand(ebx, ecx, times_4, 10000)); __ addps(xmm1, xmm0); __ addps(xmm1, Operand(ebx, ecx, times_4, 10000)); __ subps(xmm1, xmm0); @@ -409,6 +417,9 @@ TEST(DisasmIa320) { __ mulps(xmm1, Operand(ebx, ecx, times_4, 10000)); __ divps(xmm1, xmm0); __ divps(xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ ucomiss(xmm0, xmm1); + __ ucomiss(xmm0, Operand(ebx, ecx, times_4, 10000)); } { __ cvttss2si(edx, Operand(ebx, ecx, times_4, 10000)); @@ -486,6 +497,68 @@ TEST(DisasmIa320) { } } + // FMA3 instruction + { + if (CpuFeatures::IsSupported(FMA3)) { + CpuFeatureScope scope(&assm, FMA3); + __ vfmadd132sd(xmm0, xmm1, xmm2); + __ vfmadd132sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmadd213sd(xmm0, xmm1, xmm2); + __ vfmadd213sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmadd231sd(xmm0, xmm1, xmm2); + __ vfmadd231sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfmsub132sd(xmm0, xmm1, xmm2); + __ vfmsub132sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmsub213sd(xmm0, xmm1, xmm2); + __ vfmsub213sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmsub231sd(xmm0, xmm1, xmm2); + __ vfmsub231sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfnmadd132sd(xmm0, xmm1, xmm2); + __ vfnmadd132sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmadd213sd(xmm0, xmm1, xmm2); + __ vfnmadd213sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmadd231sd(xmm0, xmm1, xmm2); + __ vfnmadd231sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ 
vfnmsub132sd(xmm0, xmm1, xmm2); + __ vfnmsub132sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmsub213sd(xmm0, xmm1, xmm2); + __ vfnmsub213sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmsub231sd(xmm0, xmm1, xmm2); + __ vfnmsub231sd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfmadd132ss(xmm0, xmm1, xmm2); + __ vfmadd132ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmadd213ss(xmm0, xmm1, xmm2); + __ vfmadd213ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmadd231ss(xmm0, xmm1, xmm2); + __ vfmadd231ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfmsub132ss(xmm0, xmm1, xmm2); + __ vfmsub132ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmsub213ss(xmm0, xmm1, xmm2); + __ vfmsub213ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfmsub231ss(xmm0, xmm1, xmm2); + __ vfmsub231ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfnmadd132ss(xmm0, xmm1, xmm2); + __ vfnmadd132ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmadd213ss(xmm0, xmm1, xmm2); + __ vfnmadd213ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmadd231ss(xmm0, xmm1, xmm2); + __ vfnmadd231ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + + __ vfnmsub132ss(xmm0, xmm1, xmm2); + __ vfnmsub132ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmsub213ss(xmm0, xmm1, xmm2); + __ vfnmsub213ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + __ vfnmsub231ss(xmm0, xmm1, xmm2); + __ vfnmsub231ss(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000)); + } + } + // xchg. { __ xchg(eax, eax); -- 2.7.4
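Two encoding details the new code relies on, with illustrative sketches (standalone helpers, not part of V8; emit_vex_prefix and the disassembler handle this internally). As I read the kLIG/k66/k0F38/kW1 arguments, vfmasd and vfmass emit a three-byte VEX prefix with opcode map 0F38 (m-mmmm = 00010), mandatory prefix 66 (pp = 01), VEX.W = 1 for the sd forms and 0 for the ss forms, and VEX.L ignored for scalar operations. The disassembler's new 0F38 cases exploit a regular opcode layout: the high nibble selects the operand order (0x9_ = 132, 0xa_ = 213, 0xb_ = 231), the low nibble selects the operation (0x_9 fmadd, 0x_b fmsub, 0x_d fnmadd, 0x_f fnmsub), and VEX.W picks sd versus ss for these opcodes.

#include <stdint.h>
#include <stdio.h>

// Three-byte VEX prefix layout (Intel SDM): C4, then inverted R/X/B plus the
// opcode-map field, then W, inverted vvvv, L and the mandatory-prefix field.
struct Vex3 { uint8_t b0, b1, b2; };

static Vex3 EncodeVex3(bool r, bool x, bool b, uint8_t mmmmm,
                       bool w, uint8_t vvvv, bool l, uint8_t pp) {
  Vex3 v;
  v.b0 = 0xC4;  // three-byte VEX escape
  v.b1 = static_cast<uint8_t>((!r << 7) | (!x << 6) | (!b << 5) | (mmmmm & 0x1F));
  v.b2 = static_cast<uint8_t>((w << 7) | ((~vvvv & 0xF) << 3) | (l << 2) | (pp & 0x3));
  return v;
}

// Mnemonic from the FMA3 scalar opcode byte, mirroring the switch added to
// AVXInstruction() above (a compact sketch, not how disasm-ia32.cc is written).
static void FmaScalarMnemonic(uint8_t op, bool vex_w, char* buf, size_t len) {
  const char* order = (op >> 4) == 0x9 ? "132" : (op >> 4) == 0xa ? "213" : "231";
  const char* kind = "?";
  switch (op & 0x0F) {
    case 0x9: kind = "vfmadd"; break;
    case 0xb: kind = "vfmsub"; break;
    case 0xd: kind = "vfnmadd"; break;
    case 0xf: kind = "vfnmsub"; break;
  }
  snprintf(buf, len, "%s%ss%c", kind, order, vex_w ? 'd' : 's');
}

// Manual check, not asserted anywhere in the patch: vfmadd231sd xmm1, xmm2, xmm3
// should encode as C4 E2 E9 B9 CB, i.e.
//   EncodeVex3(false, false, false, 0x02, true, 2, false, 0x1)  -> C4 E2 E9
//   opcode 0xB9, ModRM 0xCB (mod = 11, reg = 1, rm = 3)
// and FmaScalarMnemonic(0xB9, true, ...) yields "vfmadd231sd".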