From 120ce83660dea7e70abe1c8f9408f39fe2502f8d Mon Sep 17 00:00:00 2001
From: David Green
Date: Sun, 5 Feb 2023 10:28:54 +0000
Subject: [PATCH] [DAG] Add visitABD optimizations

This adds a basic visitABD to optimize ABDS and ABDU nodes, similar to
the existing visitAVG method. The fold I was initially interested in was
folding shuffles through the binop. This also:
- Marks ABDS and ABDU as commutative binops
  (https://alive2.llvm.org/ce/z/oCDogb and https://alive2.llvm.org/ce/z/7zrs86).
- Adds reassociative folds.
- Adds constant folding using max(x,y)-min(x,y).
- Canonicalizes constants to the RHS.
- Folds abds x, 0 -> abs(x) (https://alive2.llvm.org/ce/z/4ZEibv).
- Folds abdu x, 0 -> x (https://alive2.llvm.org/ce/z/J_rKqx).
- Folds abd x, undef -> 0 (https://alive2.llvm.org/ce/z/NV6Nsv and
  https://alive2.llvm.org/ce/z/vs92hu).

Differential Revision: https://reviews.llvm.org/D143193
---
 llvm/include/llvm/CodeGen/TargetLowering.h      |  2 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   | 43 ++++++++++++++++
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp  |  4 ++
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 13 +++--
 llvm/test/CodeGen/AArch64/abd-combine.ll        | 67 ++++++++-----------------
 llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll |  4 +-
 llvm/test/CodeGen/Thumb2/mve-vabdus.ll          | 19 +++----
 7 files changed, 83 insertions(+), 69 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9ad82ab..e26540a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2694,6 +2694,8 @@ public:
     case ISD::AVGFLOORU:
     case ISD::AVGCEILS:
     case ISD::AVGCEILU:
+    case ISD::ABDS:
+    case ISD::ABDU:
       return true;
     default: return false;
     }
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 25ff0c5..112d729 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -435,6 +435,7 @@ namespace {
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
     SDValue visitAVG(SDNode *N);
+    SDValue visitABD(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1721,6 +1722,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::AVGFLOORU:
   case ISD::AVGCEILS:
   case ISD::AVGCEILU:   return visitAVG(N);
+  case ISD::ABDS:
+  case ISD::ABDU:       return visitABD(N);
   case ISD::SMUL_LOHI:  return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:  return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4892,6 +4895,46 @@ SDValue DAGCombiner::visitAVG(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitABD(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (abd c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+  // reassociate if possible
+  if (SDValue C = reassociateOps(Opcode, DL, N0, N1, N->getFlags()))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (abds x, 0) -> abs x
+    // fold (abdu x, 0) -> x
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::ABDS)
+        return DAG.getNode(ISD::ABS, DL, VT, N0);
+      if (Opcode == ISD::ABDU)
+        return N0;
+    }
+  }
+
+  // fold (abd x, undef) -> 0
+  if (N0.isUndef() || N1.isUndef())
+    return DAG.getConstant(0, DL, VT);
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a3609b..e26e703 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5710,6 +5710,10 @@ static std::optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
   }
+  case ISD::ABDS:
+    return APIntOps::smax(C1, C2) - APIntOps::smin(C1, C2);
+  case ISD::ABDU:
+    return APIntOps::umax(C1, C2) - APIntOps::umin(C1, C2);
   }
   return std::nullopt;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 10741e1..06fbc9e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5264,13 +5264,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                        Op.getOperand(2));
   }
-  case Intrinsic::aarch64_neon_sabd:
-  case Intrinsic::aarch64_neon_uabd: {
-    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
-                                                            : ISD::ABDS;
-    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
-                       Op.getOperand(2));
-  }
   case Intrinsic::aarch64_neon_saddlp:
   case Intrinsic::aarch64_neon_uaddlp: {
     unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
@@ -18185,6 +18178,12 @@ static SDValue performIntrinsicCombine(SDNode *N,
                       DAG.getConstant(N->getConstantOperandVal(2), DL, VT));
     return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht);
   }
+  case Intrinsic::aarch64_neon_sabd:
+    return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_uabd:
+    return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
     return tryCombineCRC32(0xff, N, DAG);
diff --git a/llvm/test/CodeGen/AArch64/abd-combine.ll b/llvm/test/CodeGen/AArch64/abd-combine.ll
index e2ed700..a5be2cd2 100644
--- a/llvm/test/CodeGen/AArch64/abd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/abd-combine.ll
@@ -129,7 +129,7 @@ define <8 x i16> @abdu_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: abdu_i_const_lhs:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.8h, #1
-; CHECK-NEXT: uabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -138,8 +138,7 @@ define <8 x i16> @abdu_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @abdu_i_const_zero(float %t, <8 x i16> %src1) {
 ; CHECK-LABEL: abdu_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -148,9 +147,7 @@ define <8 x i16> @abdu_i_const_zero(float %t, <8 x i16> %src1) {
 define <8 x i16> @abdu_i_const_both() {
 ; CHECK-LABEL: abdu_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: uabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -159,9 +156,7 @@ define <8 x i16> @abdu_i_const_both() {
 define <8 x i16> @abdu_i_const_bothhigh() {
 ; CHECK-LABEL: abdu_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: uabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #1
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -170,10 +165,8 @@ define <8 x i16> @abdu_i_const_bothhigh() {
 define <8 x i16> @abdu_i_const_onehigh() {
 ; CHECK-LABEL: abdu_i_const_onehigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: uabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mov w8, #32765
+; CHECK-NEXT: dup v0.8h, w8
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -182,10 +175,7 @@ define <8 x i16> @abdu_i_const_onehigh() {
 define <8 x i16> @abdu_i_const_oneneg() {
 ; CHECK-LABEL: abdu_i_const_oneneg:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: dup v0.8h, w8
-; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v0.8h, #128, lsl #8
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -194,8 +184,7 @@ define <8 x i16> @abdu_i_const_oneneg() {
 define <8 x i16> @abdu_i_zero(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: abdu_i_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -204,7 +193,7 @@ define <8 x i16> @abdu_i_zero(<8 x i16> %t, <8 x i16> %src1) {
 define <8 x i16> @abdu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: abdu_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> undef, <8 x i16> %src1)
 ret <8 x i16> %result
@@ -213,10 +202,8 @@ define <8 x i16> @abdu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 define <8 x i16> @abdu_i_reassoc(<8 x i16> %src1) {
 ; CHECK-LABEL: abdu_i_reassoc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: movi v2.8h, #1
+; CHECK-NEXT: movi v1.8h, #2
 ; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: uabd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT: ret
 %r1 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %src1, <8 x i16> )
 %result = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %r1, <8 x i16> )
@@ -360,7 +347,7 @@ define <8 x i16> @abds_i_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: abds_i_const_lhs:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi v1.8h, #1
-; CHECK-NEXT: sabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -369,8 +356,7 @@ define <8 x i16> @abds_i_const_lhs(<8 x i16> %src1) {
 define <8 x i16> @abds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: abds_i_const_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: sabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: abs v0.8h, v0.8h
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -379,9 +365,7 @@ define <8 x i16> @abds_i_const_zero(<8 x i16> %src1) {
 define <8 x i16> @abds_i_const_both() {
 ; CHECK-LABEL: abds_i_const_both:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: sabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v0.8h, #2
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -390,10 +374,7 @@ define <8 x i16> @abds_i_const_both() {
 define <8 x i16> @abds_i_const_bothhigh() {
 ; CHECK-LABEL: abds_i_const_bothhigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: mvni v1.8h, #128, lsl #8
-; CHECK-NEXT: dup v0.8h, w8
-; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v0.8h, #1
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -402,10 +383,8 @@ define <8 x i16> @abds_i_const_bothhigh() {
 define <8 x i16> @abds_i_const_onehigh() {
 ; CHECK-LABEL: abds_i_const_onehigh:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: movi v0.8h, #1
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: sabd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: mov w8, #32765
+; CHECK-NEXT: dup v0.8h, w8
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -414,10 +393,7 @@ define <8 x i16> @abds_i_const_onehigh() {
 define <8 x i16> @abds_i_const_oneneg() {
 ; CHECK-LABEL: abds_i_const_oneneg:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32766
-; CHECK-NEXT: mvni v1.8h, #1
-; CHECK-NEXT: dup v0.8h, w8
-; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v0.8h, #128, lsl #8
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> )
 ret <8 x i16> %result
@@ -426,8 +402,7 @@ define <8 x i16> @abds_i_const_oneneg() {
 define <8 x i16> @abds_i_zero(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: abds_i_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: abs v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> , <8 x i16> %src1)
 ret <8 x i16> %result
@@ -436,7 +411,7 @@ define <8 x i16> @abds_i_zero(<8 x i16> %t, <8 x i16> %src1) {
 define <8 x i16> @abds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: abds_i_undef:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> undef, <8 x i16> %src1)
 ret <8 x i16> %result
@@ -445,10 +420,8 @@ define <8 x i16> @abds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 define <8 x i16> @abds_i_reassoc(<8 x i16> %src1) {
 ; CHECK-LABEL: abds_i_reassoc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8h, #3
-; CHECK-NEXT: movi v2.8h, #1
+; CHECK-NEXT: movi v1.8h, #2
 ; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: sabd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT: ret
 %r1 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %src1, <8 x i16> )
 %result = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %r1, <8 x i16> )
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index f1dc577..a13eac9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -200,9 +200,7 @@ define <2 x i32> @test_sabd_v2i32_const() {
 ; CHECK-LABEL: test_sabd_v2i32_const:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: movi d0, #0x00ffffffff0000
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: sabd v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI19_0]
 ; CHECK-NEXT: ret
 %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(
 <2 x i32> ,
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 87e0997..bdb1168 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -617,9 +617,8 @@ for.cond.cleanup: ; preds = %vector.body
 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_commutative(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vabd_v4u32_commutative:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vabd.u32 q2, q1, q0
-; CHECK-NEXT: vabd.u32 q0, q0, q1
-; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vabd.u32 q0, q1, q0
+; CHECK-NEXT: vadd.i32 q0, q0, q0
 ; CHECK-NEXT: bx lr
 %azextsrc1 = zext <4 x i32> %src1 to <4 x i64>
 %azextsrc2 = zext <4 x i32> %src2 to <4 x i64>
@@ -642,15 +641,11 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_commutative(<4 x i32> %src1, <4 x i
 define arm_aapcs_vfpcc <4 x i32> @vabd_v4u32_shuffle(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vabd_v4u32_shuffle:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.f32 s8, s7
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s10, s5
-; CHECK-NEXT: vmov.f32 s11, s4
-; CHECK-NEXT: vmov.f32 s4, s3
-; CHECK-NEXT: vmov.f32 s5, s2
-; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vabd.u32 q0, q1, q2
+; CHECK-NEXT: vabd.u32 q1, q0, q1
+; CHECK-NEXT: vmov.f32 s0, s7
+; CHECK-NEXT: vmov.f32 s1, s6
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vmov.f32 s3, s4
 ; CHECK-NEXT: bx lr
 %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <4 x i32> 
 %s2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <4 x i32> 
-- 
2.7.4
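Not part of the patch itself: below is a minimal standalone C++ sketch of the identity the new FoldValue cases rely on, abd(c1, c2) == max(c1, c2) - min(c1, c2) in the matching signedness, together with the abd x, 0 folds listed in the commit message. The 16-bit helper names and the sample values are illustrative only and are not taken from the tests above.

```cpp
// Sketch of the ABDS/ABDU constant-folding identity on 16-bit lanes,
// assuming wrapping (modulo 2^16) arithmetic as in the DAG nodes.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>

// abdu c1, c2 -> umax(c1, c2) - umin(c1, c2)
static uint16_t abdu16(uint16_t a, uint16_t b) {
  return std::max(a, b) - std::min(a, b);
}

// abds c1, c2 -> smax(c1, c2) - smin(c1, c2), with the subtraction
// truncated to 16 bits to mirror the wrapping i16 result.
static uint16_t abds16(int16_t a, int16_t b) {
  return static_cast<uint16_t>(static_cast<uint16_t>(std::max(a, b)) -
                               static_cast<uint16_t>(std::min(a, b)));
}

int main() {
  // abd of two constants folds to a constant (sample values only).
  assert(abdu16(1, 3) == 2);
  assert(abdu16(0xfffe, 0xffff) == 1);
  // abdu x, 0 -> x and abds x, 0 -> abs(x).
  assert(abdu16(41, 0) == 41);
  assert(abds16(-7, 0) == static_cast<uint16_t>(std::abs(-7)));
  return 0;
}
```

Expressing the difference as max minus min avoids reasoning about which operand is larger or about overflow of a plain subtraction, which is the same shape the APIntOps::smax/smin and umax/umin calls in FoldValue use.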