improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern);
bool preservesResourceLen(MachineBasicBlock *MBB,
bool MachineCombiner::improvesCriticalPathLen(
MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern) {
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
       "Missing machine model\n");
// A more flexible cost calculation for the critical path includes the slack
// of the original code sequence. This may allow the transform to proceed
// even if the instruction depths (data dependency cycles) become worse.
+
unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
- unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+ unsigned RootLatency = 0;
+
+ for (auto I : DelInstrs)
+ RootLatency += TSchedModel.computeInstrLatency(I);
+
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
// the new sequence neither lengthens the critical path nor increases
// resource pressure.
if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
- InstrIdxForVirtReg, P) &&
+ DelInstrs, InstrIdxForVirtReg, P) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
for (auto *InstrPtr : InsInstrs)
MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
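To see why DelInstrs now has to be threaded into improvesCriticalPathLen, here is a minimal worked example of the cost comparison with made-up latencies; the real numbers come from the subtarget scheduling model, and the accept condition is paraphrased from the surrounding function rather than quoted from it:

    // Toy recreation of the accept test, with invented latencies: an fmul
    // (latency 5) feeding an fadd (latency 4) is combined into one fmadd
    // (latency 6). DelInstrs would hold both the fmul and the fadd.
    static bool wouldCombine(bool SumDeletedLatencies) {
      unsigned NewRootDepth = 10, NewRootLatency = 6; // the new fmadd
      unsigned RootDepth = 10, RootSlack = 0;         // the old fadd (the root)
      // Old model: latency of the root alone. New model: sum over DelInstrs.
      unsigned RootLatency = SumDeletedLatencies ? 5 + 4 : 4;
      // Accept when the new sequence is no longer than the old one.
      return NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack;
    }
    // wouldCombine(false) == false: the old model rejects the fmadd (16 > 14).
    // wouldCombine(true)  == true : the summed-latency model accepts it (16 <= 19).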
define void @foo_2d(double* %src) {
; CHECK-LABEL: %entry
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
entry:
%arrayidx1 = getelementptr inbounds double, double* %src, i64 5
%arrayidx2 = getelementptr inbounds double, double* %src, i64 11
--- /dev/null
+; Test all AArch64 subarches with scheduling models.
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=vulcan < %s | FileCheck %s
+
+; Make sure that the multiply and add feeding the load address are emitted as
+; a single fused madd.
+
+; CHECK-LABEL: fun:
+; CHECK-NOT: mul
+; CHECK: madd
+; CHECK-NOT: mul
+
+%class.D = type { %class.basic_string.base, [4 x i8] }
+%class.basic_string.base = type <{ i64, i64, i32 }>
+@a = global %class.D* zeroinitializer, align 8
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+define internal void @fun() section ".text.startup" {
+entry:
+ %tmp.i.i = alloca %class.D, align 8
+ %y = bitcast %class.D* %tmp.i.i to i8*
+ br label %loop
+loop:
+ %conv11.i.i = phi i64 [ 0, %entry ], [ %inc.i.i, %loop ]
+ %i = phi i64 [ undef, %entry ], [ %inc.i.i, %loop ]
+ %x = load %class.D*, %class.D** getelementptr inbounds (%class.D*, %class.D** @a, i64 0), align 8
+ %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %x, i64 %conv11.i.i
+ %d = bitcast %class.D* %arrayidx.i.i.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %y, i8* %d, i64 24, i32 8, i1 false)
+ %inc.i.i = add i64 %i, 1
+ %cmp.i.i = icmp slt i64 %inc.i.i, 0
+ br i1 %cmp.i.i, label %loop, label %exit
+exit:
+ ret void
+}
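For context on what the RUN lines above exercise: %class.D is 24 bytes, so the getelementptr in the loop lowers to base + index*24, and the mul/add pair computing that address is expected to stay fused as a single madd on every listed CPU (all of which have scheduling models, per the comment at the top of the test). A rough sketch of the address arithmetic, with invented names, not part of the test:

    #include <cstdint>

    // Element i of an array of 24-byte %class.D objects lives at Base + i * 24.
    // On AArch64 this is one madd (with 24 materialized into a register)
    // rather than a separate mul and add, which is what the CHECK/CHECK-NOT
    // lines assert.
    static char *elementAddress(char *Base, uint64_t Index) {
      return Base + Index * 24;
    }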
define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: test_128bitmul:
-; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
-; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK: mul [[PART2:x[0-9]+]], x1, x2
-; CHECK: mul x0, x0, x2
+; CHECK: umulh [[HI:x[0-9]+]], x0, x2
+; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
+; CHECK-NEXT: ret
; CHECK-BE-LABEL: test_128bitmul:
-; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
-; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
-; CHECK-BE: mul x1, x1, x3
+; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
+; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
+; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
+; CHECK-BE-DAG: mul x1, x1, x3
+; CHECK-BE-NEXT: ret
%prod = mul i128 %lhs, %rhs
ret i128 %prod
; CHECK-LABEL: test_128bitmul_optsize:
; CHECK: umulh [[HI:x[0-9]+]], x0, x2
; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
; CHECK-NEXT: ret
%prod = mul i128 %lhs, %rhs
; CHECK-LABEL: test_128bitmul_minsize:
; CHECK: umulh [[HI:x[0-9]+]], x0, x2
; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
; CHECK-NEXT: ret
%prod = mul i128 %lhs, %rhs
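For readers of the updated mul-lohi.ll checks: a 128-bit multiply is lowered to 64-bit halves, and the two cross products are each folded into a multiply-accumulate, which is where the new madd checks come from. An illustrative decomposition follows; the names are invented, and the register comments refer to the little-endian CHECK lines, where x0/x1 hold the low/high halves of %lhs and x2/x3 those of %rhs:

    #include <cstdint>

    // High 64 bits of a 64x64 multiply (stand-in for AArch64's umulh).
    // unsigned __int128 is a GCC/Clang extension.
    static uint64_t umulh(uint64_t A, uint64_t B) {
      return (uint64_t)(((unsigned __int128)A * B) >> 64);
    }

    // 128-bit product from 64-bit halves, mirroring the checked instruction mix.
    static void mul128(uint64_t LHSLo, uint64_t LHSHi, uint64_t RHSLo,
                       uint64_t RHSHi, uint64_t &ResLo, uint64_t &ResHi) {
      ResLo = LHSLo * RHSLo;             // mul  x0, x0, x2
      ResHi = umulh(LHSLo, RHSLo)        // umulh [[HI]], x0, x2
              + LHSLo * RHSHi            // madd [[TEMP1]], x0, x3, [[HI]]
              + LHSHi * RHSLo;           // madd x1, x1, x2, [[TEMP1]]
    }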