From e21f9ab42b90bb125380a62f2f54071a705bdf92 Mon Sep 17 00:00:00 2001
From: bmeurer
Date: Mon, 13 Apr 2015 01:25:27 -0700
Subject: [PATCH] [x86] Allow (v)divsd->(v)mulsd to execute in parallel.

This tweak was already present in CrankShaft for the non-AVX case. As it
turns out, it's also relevant even with AVX. Now the same optimization is
applied in case of TurboFan as well.

R=dcarney@chromium.org

Review URL: https://codereview.chromium.org/1081033003

Cr-Commit-Position: refs/heads/master@{#27774}
---
 src/compiler/ia32/code-generator-ia32.cc | 12 ++++++++++++
 src/compiler/x64/code-generator-x64.cc   | 12 ++++++++++++
 src/ia32/lithium-codegen-ia32.cc         |  6 +++---
 src/x64/lithium-codegen-x64.cc           |  6 +++---
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/compiler/ia32/code-generator-ia32.cc b/src/compiler/ia32/code-generator-ia32.cc
index aaa34dcfd..76a457b8f 100644
--- a/src/compiler/ia32/code-generator-ia32.cc
+++ b/src/compiler/ia32/code-generator-ia32.cc
@@ -468,6 +468,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat32Div:
       __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Max:
       __ maxss(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -506,6 +509,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat64Div:
       __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat64Max:
       __ maxsd(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -629,6 +635,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       CpuFeatureScope avx_scope(masm(), AVX);
       __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat32Max: {
@@ -665,6 +674,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       CpuFeatureScope avx_scope(masm(), AVX);
       __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat64Max: {
diff --git a/src/compiler/x64/code-generator-x64.cc b/src/compiler/x64/code-generator-x64.cc
index 1e5d62f7e..4213c9f13 100644
--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@@ -726,6 +726,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat32Div:
       ASSEMBLE_SSE_BINOP(divss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Abs: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
@@ -767,6 +770,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat64Div:
       ASSEMBLE_SSE_BINOP(divsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat64Mod: {
       __ subq(rsp, Immediate(kDoubleSize));
@@ -919,6 +925,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kAVXFloat32Div:
       ASSEMBLE_AVX_BINOP(vdivss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kAVXFloat32Max:
       ASSEMBLE_AVX_BINOP(vmaxss);
@@ -946,6 +955,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kAVXFloat64Div:
       ASSEMBLE_AVX_BINOP(vdivsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kAVXFloat64Max:
       ASSEMBLE_AVX_BINOP(vmaxsd);
diff --git a/src/ia32/lithium-codegen-ia32.cc b/src/ia32/lithium-codegen-ia32.cc
index 7714e9f6e..72bb270e7 100644
--- a/src/ia32/lithium-codegen-ia32.cc
+++ b/src/ia32/lithium-codegen-ia32.cc
@@ -2040,10 +2040,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
       } else {
         DCHECK(result.is(left));
         __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
       }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
       break;
     case Token::MOD: {
       // Pass two doubles as arguments on the stack.
diff --git a/src/x64/lithium-codegen-x64.cc b/src/x64/lithium-codegen-x64.cc
index 8da76fa67..3b72d690f 100644
--- a/src/x64/lithium-codegen-x64.cc
+++ b/src/x64/lithium-codegen-x64.cc
@@ -2076,10 +2076,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
       } else {
         DCHECK(result.is(left));
         __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
       }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
       break;
     case Token::MOD: {
       XMMRegister xmm_scratch = double_scratch0();
-- 
2.34.1
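
For illustration only (not part of the patch): the snippet below is a minimal
sketch of the dependent divide->multiply chain that the extra movaps targets,
written with SSE2 intrinsics. The function name, variable names, and the
register names in the comments are placeholders, not code from V8. The
movaps self-move has no intrinsic form and a C++ compiler would drop it, so
it appears only as a comment describing the sequence the JIT now emits.

// Sketch under the assumptions above: a multiply that depends directly on
// the result of a scalar double division.
#include <emmintrin.h>

double div_then_mul(double a, double b, double c) {
  __m128d dividend = _mm_set_sd(a);
  __m128d divisor = _mm_set_sd(b);
  __m128d factor = _mm_set_sd(c);

  // divsd xmm0, xmm1   ; long-latency scalar double division
  __m128d quotient = _mm_div_sd(dividend, divisor);

  // movaps xmm0, xmm0  ; self-move emitted by the patch in the JIT output
  //                    ; only; per the comment added in the patch, it may
  //                    ; help some CPUs when a (v)mulsd depends on the result.

  // mulsd xmm2, xmm0   ; multiply that consumes the quotient
  __m128d product = _mm_mul_sd(factor, quotient);

  return _mm_cvtsd_f64(product);
}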