From fea532230bf4cf677d0ae6028eedf0135aa8b9e2 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Wed, 4 Sep 2019 08:41:34 +0000 Subject: [PATCH] [ARM][ParallelDSP] SExt mul for accumulation For any unpaired muls, we accumulate them as an input to the reduction. Check the type of the mul and perform a sext if the existing accumulator input type is not the same. Differential Revision: https://reviews.llvm.org/D66993 llvm-svn: 370851 --- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 19 ++- llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll | 57 +++++++++ llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll | 123 +++++++++++++++++++ llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | 46 +++++++ llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll | 149 +++++++++++++++++++++++ 5 files changed, 389 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 212c5a3..cb022dd 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -649,18 +649,27 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { if (MulCand->Paired) continue; - LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *MulCand->Root - << "\n"); + Value *Mul = MulCand->Root; + LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n"); + + if (R.getRoot()->getType() != Mul->getType()) { + assert(R.is64Bit() && "expected 64-bit result"); + Mul = Builder.CreateSExt(Mul, R.getRoot()->getType()); + } + if (!Acc) { - Acc = MulCand->Root; + Acc = Mul; continue; } - Acc = Builder.CreateAdd(MulCand->Root, Acc); + + Acc = Builder.CreateAdd(Mul, Acc); InsertAfter = cast(Acc); } if (!Acc) - Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); + Acc = R.is64Bit() ? 
+ ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) : + ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll index d9dbd96..5ca8a16 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -25,6 +25,33 @@ entry: ret i32 %res } +; CHECK-LABEL: single_block_64 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 %acc) +define i64 @single_block_64(i16* %a, i16* %b, i64 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %sext.mul.0 = sext i32 %mul.0 to i64 + %sext.mul.1 = sext i32 %mul.1 to i64 + %add = add i64 %sext.mul.0, %sext.mul.1 + %res = add i64 %add, %acc + ret i64 %res +} + ; CHECK-LABEL: multi_block ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] @@ -53,6 +80,36 @@ bb.1: ret i32 %res } +; CHECK-LABEL: multi_block_64 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 0) +define i64 
@multi_block_64(i16* %a, i16* %b, i64 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %sext.mul.0 = sext i32 %mul.0 to i64 + %sext.mul.1 = sext i32 %mul.1 to i64 + %add = add i64 %sext.mul.0, %sext.mul.1 + br label %bb.1 + +bb.1: + %res = add i64 %add, %acc + ret i64 %res +} + ; CHECK-LABEL: multi_block_1 ; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll index c072df4..0e2a21e 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -139,6 +139,87 @@ entry: ret i32 %res } +; CHECK-LABEL: exchange_multi_use_64_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc +; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) +define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = 
load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.3, %sext.b.1 + %mul.3 = mul i32 %sext.a.2, %sext.b.0 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %sext.add.2 = sext i32 %add.2 to i64 + %res = add i64 %sext.add.2, %acc + ret i64 %res +} + +; CHECK-LABEL: exchange_multi_use_64_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc +; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) +define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %sext.add = sext i32 %add to i64 + %addr.a.2 = 
getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.3, %sext.b.1 + %mul.3 = mul i32 %sext.a.2, %sext.b.0 + %add.1 = add i32 %mul.2, %mul.3 + %sext.add.1 = sext i32 %add.1 to i64 + %add.2 = add i64 %sext.add, %sext.add.1 + %res = add i64 %add.2, %acc + ret i64 %res +} + ; CHECK-LABEL: exchange_multi_use_2 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] @@ -216,6 +297,48 @@ entry: ret i32 %res } +; TODO: Would it be better to generate a smlad and then sign extend it? +; CHECK-LABEL: exchange_multi_use_64_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0) +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]]) +define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext 
i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %sext.add = sext i32 %add to i64 + %sext.add.1 = sext i32 %add.1 to i64 + %add.2 = add i64 %sext.add, %sext.add.1 + %res = sub i64 %acc, %add.2 + ret i64 %res +} + ; TODO: Why isn't smladx generated too? ; CHECK-LABEL: exchange_multi_use_4 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll index a071ec3..1f4b141 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -42,6 +42,52 @@ entry: ret i32 %res } +; TODO: Is it really best to generate smlald for the first instruction? Does +; this just increase register pressure unnecessarily? 
+; CHECK-LABEL: overlap_64_1 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) +; CHECK: ret i64 [[RES]] +define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %sext.add = sext i32 %add to i64 + %sext.add.1 = sext i32 %add.1 to i64 + %add.2 = add i64 %sext.add.1, %sext.add + %res = add i64 %add.2, %acc + ret i64 %res +} + ; CHECK-LABEL: overlap_2 ; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 ; CHECK: [[ADDR_B_1:%[^ 
]+]] = getelementptr i16, i16* %b, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll index a2f4745..5cccc05 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -141,5 +141,154 @@ entry: ret i32 %add30 } +; CHECK-LABEL: with_64bit_acc +; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 +; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 +; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 +; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 +; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 +; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 +; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] +; CHECK: [[SEXT1:%[^ ]+]] = sext i32 [[MUL0]] to i64 +; CHECK: [[ADD0:%[^ ]+]] = add i64 %sext.0, [[SEXT1]] +; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 +; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* +; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 +; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 +; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* +; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 +; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* +; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 +; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 +; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* +; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 +; CHECK: [[ACC:%[^ ]+]] = 
call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]]) +; CHECK: ret i64 [[RES]] +define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) { +entry: + %0 = load i16, i16* %in, align 2 + %conv = sext i16 %0 to i32 + %1 = load i16, i16* %b, align 2 + %conv2 = sext i16 %1 to i32 + %call = tail call i32 @bar(i32 %conv, i32 %conv2) + %sext.0 = sext i32 %call to i64 + %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1 + %2 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %2 to i32 + %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1 + %3 = load i16, i16* %arrayidx5, align 2 + %conv6 = sext i16 %3 to i32 + %mul = mul nsw i32 %conv6, %conv4 + %sext.1 = sext i32 %mul to i64 + %add = add i64 %sext.0, %sext.1 + %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2 + %4 = load i16, i16* %arrayidx7, align 2 + %conv8 = sext i16 %4 to i32 + %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2 + %5 = load i16, i16* %arrayidx9, align 2 + %conv10 = sext i16 %5 to i32 + %mul11 = mul nsw i32 %conv10, %conv8 + %sext.2 = sext i32 %mul11 to i64 + %add12 = add i64 %add, %sext.2 + %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3 + %6 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %6 to i32 + %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3 + %7 = load i16, i16* %arrayidx15, align 2 + %conv16 = sext i16 %7 to i32 + %mul17 = mul nsw i32 %conv16, %conv14 + %sext.3 = sext i32 %mul17 to i64 + %add18 = add i64 %add12, %sext.3 + %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = sext i16 %8 to i32 + %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4 + %9 = load i16, i16* %arrayidx21, align 2 + %conv22 = sext i16 %9 to i32 + %mul23 = mul nsw i32 %conv22, %conv20 + %sext.4 = sext i32 %mul23 to i64 + %add24 = add i64 
%add18, %sext.4 + %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5 + %10 = load i16, i16* %arrayidx25, align 2 + %conv26 = sext i16 %10 to i32 + %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5 + %11 = load i16, i16* %arrayidx27, align 2 + %conv28 = sext i16 %11 to i32 + %mul29 = mul nsw i32 %conv28, %conv26 + %sext.5 = sext i32 %mul29 to i64 + %add30 = add i64 %add24, %sext.5 + ret i64 %add30 +} + +; CHECK: with_64bit_add_acc +; CHECK: [[ADDR_X_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %px.10756.unr, i32 1 +; CHECK: [[X:%[^ ]+]] = load i16, i16* %px.10756.unr, align 2 +; CHECK: [[SEXT_X:%[^ ]+]] = sext i16 [[X]] to i32 +; CHECK: [[ADDR_Y_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -1 +; CHECK: [[Y:%[^ ]+]] = load i16, i16* %py.8757.unr, align 2 +; CHECK: [[SEXT_Y:%[^ ]+]] = sext i16 [[Y]] to i32 +; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[SEXT_Y]], [[SEXT_X]] +; CHECK: [[SEXT_MUL0:%[^ ]+]] = sext i32 [[MUL0]] to i64 +; CHECK: [[ADD_1:%[^ ]+]] = add nsw i64 %sum.3758.unr, [[SEXT_MUL0]] +; CHECK: [[X_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %px.10756.unr, i32 2 +; CHECK: [[X_1:%[^ ]+]] = load i16, i16* [[ADDR_X_PLUS_1]], align 2 +; CHECK: [[SEXT_X_1:%[^ ]+]] = sext i16 [[X_1]] to i32 +; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2 +; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32 +; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]] +; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32* +; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2 +; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3 +; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32* +; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2 +; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 +; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 
[[Y_3]], i32 [[X_2]], i64 [[ACC]]) +; CHECK: ret i64 [[RES]] +define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) { +entry: + %sum.3758.unr = sext i32 %acc to i64 + br label %bb.1 + +bb.1: + %incdec.ptr184.epil = getelementptr inbounds i16, i16* %px.10756.unr, i32 1 + %tmp216 = load i16, i16* %px.10756.unr, align 2 + %conv185.epil = sext i16 %tmp216 to i32 + %incdec.ptr186.epil = getelementptr inbounds i16, i16* %py.8757.unr, i32 -1 + %tmp217 = load i16, i16* %py.8757.unr, align 2 + %conv187.epil = sext i16 %tmp217 to i32 + %mul.epil = mul nsw i32 %conv187.epil, %conv185.epil + %conv188.epil = sext i32 %mul.epil to i64 + %add189.epil = add nsw i64 %sum.3758.unr, %conv188.epil + %incdec.ptr190.epil = getelementptr inbounds i16, i16* %px.10756.unr, i32 2 + %tmp218 = load i16, i16* %incdec.ptr184.epil, align 2 + %conv191.epil = sext i16 %tmp218 to i32 + %incdec.ptr192.epil = getelementptr inbounds i16, i16* %py.8757.unr, i32 -2 + %tmp219 = load i16, i16* %incdec.ptr186.epil, align 2 + %conv193.epil = sext i16 %tmp219 to i32 + %mul194.epil = mul nsw i32 %conv193.epil, %conv191.epil + %conv195.epil = sext i32 %mul194.epil to i64 + %add196.epil = add nsw i64 %add189.epil, %conv195.epil + %incdec.ptr197.epil = getelementptr inbounds i16, i16* %px.10756.unr, i32 3 + %tmp220 = load i16, i16* %incdec.ptr190.epil, align 2 + %conv198.epil = sext i16 %tmp220 to i32 + %incdec.ptr199.epil = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3 + %tmp221 = load i16, i16* %incdec.ptr192.epil, align 2 + %conv200.epil = sext i16 %tmp221 to i32 + %mul201.epil = mul nsw i32 %conv200.epil, %conv198.epil + %conv202.epil = sext i32 %mul201.epil to i64 + %add203.epil = add nsw i64 %add196.epil, %conv202.epil + %tmp222 = load i16, i16* %incdec.ptr197.epil, align 2 + %conv205.epil = sext i16 %tmp222 to i32 + %tmp223 = load i16, i16* %incdec.ptr199.epil, align 2 + %conv207.epil = sext i16 %tmp223 to i32 + %mul208.epil = mul nsw 
i32 %conv207.epil, %conv205.epil + %conv209.epil = sext i32 %mul208.epil to i64 + %add210.epil = add nsw i64 %add203.epil, %conv209.epil + ret i64 %add210.epil +} + declare dso_local i32 @bar(i32, i32) local_unnamed_addr -- 2.7.4