From e49367e7f379e54e482480a502f64196db2c5663 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 17 Jan 2023 11:29:51 +0000 Subject: [PATCH] [ARM] Fix i1 shuffle lowering with multiple operands. The existing lowering of i1 vector shuffle was only considering single-source shuffles, always assuming the second was undef. This extends that to properly handle both operands. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 13 ++- llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll | 122 +++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 5d45a8e..9949ea1 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -8554,6 +8554,7 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, "No support for vector shuffle of boolean predicates"); SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); SDLoc dl(Op); if (isReverseMask(ShuffleMask, VT)) { SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); @@ -8571,12 +8572,16 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, // many cases the generated code might be even better than scalar code // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit // fields in a register into 8 other arbitrary 2-bit fields! - SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); - EVT NewVT = PredAsVector.getValueType(); + SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG); + EVT NewVT = PredAsVector1.getValueType(); + SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT) + : PromoteMVEPredVector(dl, V2, VT, DAG); + assert(PredAsVector2.getValueType() == NewVT && + "Expected identical vector type in expanded i1 shuffle!"); // Do the shuffle! - SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, - DAG.getUNDEF(NewVT), ShuffleMask); + SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1, + PredAsVector2, ShuffleMask); // Now return the result of comparing the shuffled vector with zero, // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll index 307d93e..cca1516 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -740,3 +740,125 @@ entry: %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b ret <16 x i8> %s } + +define <16 x i8> @shuffle2src_v16i8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: shuffle2src_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d6, r0, r1 +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d7, r2, r3 +; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vcmp.i8 eq, q3, zr +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmovnt.i16 q2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i8 ne, q2, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c1 = icmp eq <16 x i8> %src1, zeroinitializer + %c2 = icmp eq <16 x i8> %src2, zeroinitializer + %sh = shufflevector <16 x i1> %c1, <16 x i1> %c2, <16 x i32> + %s = select <16 x i1> %sh, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define <8 x i16> @shuffle2src_v8i16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: shuffle2src_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d6, r0, r1 +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d7, r2, r3 +; CHECK-NEXT: vcmp.i16 eq, q2, zr +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vcmp.i16 eq, q3, zr +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmovnt.i32 q2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c1 = icmp eq <8 x i16> %src1, zeroinitializer + %c2 = icmp eq <8 x i16> %src2, zeroinitializer + %sh = shufflevector <8 x i1> %c1, <8 x i1> %c2, <8 x i32> + %s = select <8 x i1> %sh, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define <4 x i32> @shuffle2src_v4i32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: shuffle2src_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d6, r0, r1 +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d7, r2, r3 +; CHECK-NEXT: vcmp.i32 eq, q2, zr +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vpsel q2, q1, q0 +; CHECK-NEXT: vcmp.i32 eq, q3, zr +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c1 = icmp eq <4 x i32> %src1, zeroinitializer + %c2 = icmp eq <4 x i32> %src2, zeroinitializer + %sh = shufflevector <4 x i1> %c1, <4 x i1> %c2, <4 x i32> + %s = select <4 x i1> %sh, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define <2 x i64> @shuffle2src_v2i64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: shuffle2src_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r2, r3, [sp] +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: csetm r2, eq +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: bfi r3, r2, #0, #8 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r3, r0, #8, #8 +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r3 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %c1 = icmp eq <2 x i64> %src1, zeroinitializer + %c2 = icmp eq <2 x i64> %src2, zeroinitializer + %sh = shufflevector <2 x i1> %c1, <2 x i1> %c2, <2 x i32> + %s = select <2 x i1> %sh, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} -- 2.7.4