instr-combiner: sum up all latencies of the transformed instructions

author Sebastian Pop <sebpop@gmail.com>

Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)

committer Sebastian Pop <sebpop@gmail.com>

Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)
author Sebastian Pop <sebpop@gmail.com>
Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)
committer Sebastian Pop <sebpop@gmail.com>
Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp

index b4e217c..54c4741 100644 (file)
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -71,6 +71,7 @@ private:
    improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                            DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                            MachineCombinerPattern Pattern);
    bool preservesResourceLen(MachineBasicBlock *MBB,
@@ -242,6 +243,7 @@ bool MachineCombiner::improvesCriticalPathLen(
      MachineBasicBlock *MBB, MachineInstr *Root,
      MachineTraceMetrics::Trace BlockTrace,
      SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
      MachineCombinerPattern Pattern) {
    assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
@@ -269,8 +271,13 @@ bool MachineCombiner::improvesCriticalPathLen(
    // A more flexible cost calculation for the critical path includes the slack
    // of the original code sequence. This may allow the transform to proceed
    // even if the instruction depths (data dependency cycles) become worse.
+
    unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
-  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+  unsigned RootLatency = 0;
+
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
    unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  
    DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
@@ -421,7 +428,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
        // resource pressure.
        if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                   InstrIdxForVirtReg, P) &&
+                                   DelInstrs, InstrIdxForVirtReg, P) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
          for (auto *InstrPtr : InsInstrs)
            MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

index ab875c0..95ef0f9 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -2,7 +2,7 @@
  define void @foo_2d(double* %src) {
  ; CHECK-LABEL: %entry
  ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
  entry:
    %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
    %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-madd.ll b/llvm/test/CodeGen/AArch64/machine-combiner-madd.ll

new file mode 100644 (file)

index 0000000..ea31137
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
@@ -0,0 +1,40 @@
+; Test all AArch64 subarches with scheduling models.
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone    < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1  < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2  < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo       < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=vulcan     < %s | FileCheck %s
+
+; Make sure that the machine combiner fuses the multiply-add in the addressing
+; mode of the load.
+
+; CHECK-LABEL: fun:
+; CHECK-NOT: mul
+; CHECK:     madd
+; CHECK-NOT: mul
+
+%class.D = type { %class.basic_string.base, [4 x i8] }
+%class.basic_string.base = type <{ i64, i64, i32 }>
+@a = global %class.D* zeroinitializer, align 8
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+define internal void @fun() section ".text.startup" {
+entry:
+  %tmp.i.i = alloca %class.D, align 8
+  %y = bitcast %class.D* %tmp.i.i to i8*
+  br label %loop
+loop:
+  %conv11.i.i = phi i64 [ 0, %entry ], [ %inc.i.i, %loop ]
+  %i = phi i64 [ undef, %entry ], [ %inc.i.i, %loop ]
+  %x = load %class.D*, %class.D** getelementptr inbounds (%class.D*, %class.D** @a, i64 0), align 8
+  %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %x, i64 %conv11.i.i
+  %d = bitcast %class.D* %arrayidx.i.i.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %y, i8* %d, i64 24, i32 8, i1 false)
+  %inc.i.i = add i64 %i, 1
+  %cmp.i.i = icmp slt i64 %inc.i.i, 0
+  br i1 %cmp.i.i, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/mul-lohi.ll b/llvm/test/CodeGen/AArch64/mul-lohi.ll

index e935218..4ba4cfa 100644 (file)
--- a/llvm/test/CodeGen/AArch64/mul-lohi.ll
+++ b/llvm/test/CodeGen/AArch64/mul-lohi.ll
@@ -3,16 +3,18 @@
  
  define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
  ; CHECK-LABEL: test_128bitmul:
-; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
-; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK: mul [[PART2:x[0-9]+]], x1, x2
-; CHECK: mul x0, x0, x2
+; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
+; CHECK:       madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
+; CHECK-NEXT:  ret
  
  ; CHECK-BE-LABEL: test_128bitmul:
-; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
-; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
-; CHECK-BE: mul x1, x1, x3
+; CHECK-BE:       umulh [[HI:x[0-9]+]], x1, x3
+; CHECK-BE:       madd  [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
+; CHECK-BE-DAG:   madd  x0, x0, x3, [[TEMP1]]
+; CHECK-BE-DAG:   mul   x1, x1, x3
+; CHECK-BE-NEXT:  ret
  
    %prod = mul i128 %lhs, %rhs
    ret i128 %prod
@@ -25,8 +27,8 @@ define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize {
  ; CHECK-LABEL: test_128bitmul_optsize:
  ; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
  ; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
  ; CHECK-NEXT:  ret
  
    %prod = mul i128 %lhs, %rhs
@@ -37,8 +39,8 @@ define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize {
  ; CHECK-LABEL: test_128bitmul_minsize:
  ; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
  ; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
  ; CHECK-NEXT:  ret
  
    %prod = mul i128 %lhs, %rhs
author	Sebastian Pop <sebpop@gmail.com>
	Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)
committer	Sebastian Pop <sebpop@gmail.com>
	Sun, 11 Dec 2016 19:39:32 +0000 (19:39 +0000)
llvm/lib/CodeGen/MachineCombiner.cpp		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-fma-combines.ll		patch | blob | history
llvm/test/CodeGen/AArch64/machine-combiner-madd.ll	[new file with mode: 0644]	patch | blob
llvm/test/CodeGen/AArch64/mul-lohi.ll		patch | blob | history