From ae9a346ef8620851b54682b4b6738788af709578 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Aug 2021 16:37:39 +0100
Subject: [PATCH] [ARM] Fix DAG combine loop in reduction distribution

Given a constant operand, the MVE and DAGCombine combines could fight,
each redistributing in the opposite order. Add a guard to the MVE
vecreduce distribution to prevent that.

---
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |  3 +-
 llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll | 47 +++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 79cf91f..9e7f4030 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13089,7 +13089,8 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
   //   add(add(X, vecreduce(Y)), vecreduce(Z))
   // to make better use of vaddva style instructions.
   if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
-      IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1))) {
+      IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
+      !isa<ConstantSDNode>(N0)) {
     SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
     return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
   }
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
index 44b8005..465457b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
@@ -3772,6 +3772,53 @@ entry:
   ret i8 %52
 }
 
+
+define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: add_two_const:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    adds r0, #10
+; CHECK-NEXT:    bx lr
+entry:
+  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
+  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
+  %c = add i32 %a, %b
+  %d = add i32 %c, 10
+  ret i32 %d
+}
+
+define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: add_two_const2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    adds r0, #10
+; CHECK-NEXT:    bx lr
+entry:
+  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
+  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
+  %c = add i32 %a, 10
+  %d = add i32 %c, %b
+  ret i32 %d
+}
+
+define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: add_two_const3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r0, q1
+; CHECK-NEXT:    adds r0, #20
+; CHECK-NEXT:    bx lr
+entry:
+  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
+  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
+  %c = add i32 %a, 10
+  %d = add i32 %b, 10
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
-- 
2.7.4