[x86] Allow (v)divsd->(v)mulsd to execute in parallel.

author bmeurer <bmeurer@chromium.org>

Mon, 13 Apr 2015 08:25:27 +0000 (01:25 -0700)

committer Commit bot <commit-bot@chromium.org>

Mon, 13 Apr 2015 08:25:24 +0000 (08:25 +0000)
author bmeurer <bmeurer@chromium.org>
Mon, 13 Apr 2015 08:25:27 +0000 (01:25 -0700)
committer Commit bot <commit-bot@chromium.org>
Mon, 13 Apr 2015 08:25:24 +0000 (08:25 +0000)
diff --git a/src/compiler/ia32/code-generator-ia32.cc b/src/compiler/ia32/code-generator-ia32.cc

index aaa34dcfd66f3dcc58fc70edc50f044651074dbf..76a457b8f6645d82b8c97753511fe4777df111e4 100644 (file)
--- a/src/compiler/ia32/code-generator-ia32.cc
+++ b/src/compiler/ia32/code-generator-ia32.cc
@@ -468,6 +468,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kSSEFloat32Div:
        __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kSSEFloat32Max:
        __ maxss(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -506,6 +509,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kSSEFloat64Div:
        __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kSSEFloat64Max:
        __ maxsd(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -629,6 +635,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        CpuFeatureScope avx_scope(masm(), AVX);
        __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                  i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      }
      case kAVXFloat32Max: {
@@ -665,6 +674,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        CpuFeatureScope avx_scope(masm(), AVX);
        __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                  i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      }
      case kAVXFloat64Max: {
diff --git a/src/compiler/x64/code-generator-x64.cc b/src/compiler/x64/code-generator-x64.cc

index 1e5d62f7e3c522a336c29bbf9214d5d24afc5d3a..4213c9f135131957dc51e93505a2b7f4444df20f 100644 (file)
--- a/src/compiler/x64/code-generator-x64.cc
+++ b/src/compiler/x64/code-generator-x64.cc
@@ -726,6 +726,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kSSEFloat32Div:
        ASSEMBLE_SSE_BINOP(divss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kSSEFloat32Abs: {
        // TODO(bmeurer): Use RIP relative 128-bit constants.
@@ -767,6 +770,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kSSEFloat64Div:
        ASSEMBLE_SSE_BINOP(divsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kSSEFloat64Mod: {
        __ subq(rsp, Immediate(kDoubleSize));
@@ -919,6 +925,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kAVXFloat32Div:
        ASSEMBLE_AVX_BINOP(vdivss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kAVXFloat32Max:
        ASSEMBLE_AVX_BINOP(vmaxss);
@@ -946,6 +955,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
        break;
      case kAVXFloat64Div:
        ASSEMBLE_AVX_BINOP(vdivsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
        break;
      case kAVXFloat64Max:
        ASSEMBLE_AVX_BINOP(vmaxsd);
diff --git a/src/ia32/lithium-codegen-ia32.cc b/src/ia32/lithium-codegen-ia32.cc

index 7714e9f6e9bae8a59a76f29e136268c548f0a30c..72bb270e7a6fcfe384da3579b12a989e721b9cd2 100644 (file)
--- a/src/ia32/lithium-codegen-ia32.cc
+++ b/src/ia32/lithium-codegen-ia32.cc
@@ -2040,10 +2040,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
        } else {
          DCHECK(result.is(left));
          __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
        }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
        break;
      case Token::MOD: {
        // Pass two doubles as arguments on the stack.
diff --git a/src/x64/lithium-codegen-x64.cc b/src/x64/lithium-codegen-x64.cc

index 8da76fa673ed465cf6228b08ad8ffe1deb68f594..3b72d690fe514c17d17d939b3ad3b13d24748b86 100644 (file)
--- a/src/x64/lithium-codegen-x64.cc
+++ b/src/x64/lithium-codegen-x64.cc
@@ -2076,10 +2076,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
        } else {
          DCHECK(result.is(left));
          __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
        }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
        break;
      case Token::MOD: {
        XMMRegister xmm_scratch = double_scratch0();
author	bmeurer <bmeurer@chromium.org>
	Mon, 13 Apr 2015 08:25:27 +0000 (01:25 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Mon, 13 Apr 2015 08:25:24 +0000 (08:25 +0000)
src/compiler/ia32/code-generator-ia32.cc		patch | blob | history
src/compiler/x64/code-generator-x64.cc		patch | blob | history
src/ia32/lithium-codegen-ia32.cc		patch | blob | history
src/x64/lithium-codegen-x64.cc		patch | blob | history