From 693d3b7e76367a1dd31b594ba72bdda5391dfef3 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 26 Mar 2022 21:10:43 +0000
Subject: [PATCH] [AArch64] Lower 3 and 4 sources buildvectors to TBL

The default expansion for buildvectors is to extract each element and
insert it into a new vector. That involves a lot of copying to/from the
GPR registers. TBL3 and TBL4 can be relatively slow instructions, with
the mask needing to be loaded from a constant pool, but they should
always be better than all the moves to/from GPRs.

Differential Revision: https://reviews.llvm.org/D121137
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp   |  68 ++++-
 llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll    | 355 ++++++++++------------
 llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll    | 295 ++++++++----------
 llvm/test/CodeGen/AArch64/neon-extracttruncate.ll |  37 +--
 llvm/test/CodeGen/AArch64/shuffle-tbl34.ll        | 349 +++++++++++----------
 llvm/test/CodeGen/AArch64/tbl-loops.ll            | 152 ++++-----
 6 files changed, 592 insertions(+), 664 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cb8207e..c482899 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9079,10 +9079,72 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     Source->MaxElt = std::max(Source->MaxElt, EltNo);
   }
 
+  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
+  // better than moving to/from gpr registers for larger vectors.
+  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
+    // Construct a mask for the tbl. We may need to adjust the index for types
+    // larger than i8.
+    SmallVector Mask;
+    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
+    for (unsigned I = 0; I < NumElts; ++I) {
+      SDValue V = Op.getOperand(I);
+      if (V.isUndef()) {
+        for (unsigned OF = 0; OF < OutputFactor; OF++)
+          Mask.push_back(-1);
+        continue;
+      }
+      // Set the Mask lanes adjusted for the size of the input and output
+      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
+      // output element, adjusted in their positions per input and output types.
+      unsigned Lane = V.getConstantOperandVal(1);
+      for (unsigned S = 0; S < Sources.size(); S++) {
+        if (V.getOperand(0) == Sources[S].Vec) {
+          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
+          unsigned InputBase = 16 * S + Lane * InputSize / 8;
+          for (unsigned OF = 0; OF < OutputFactor; OF++)
+            Mask.push_back(InputBase + OF);
+          break;
+        }
+      }
+    }
+
+    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
+    // v16i8, and the TBLMask
+    SmallVector TBLOperands;
+    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
+                                              ? Intrinsic::aarch64_neon_tbl3
+                                              : Intrinsic::aarch64_neon_tbl4,
+                                          dl, MVT::i32));
+    for (unsigned i = 0; i < Sources.size(); i++) {
+      SDValue Src = Sources[i].Vec;
+      EVT SrcVT = Src.getValueType();
+      Src = DAG.getBitcast(SrcVT.is64BitVector() ? 
MVT::v8i8 : MVT::v16i8, Src); + assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && + "Expected a legally typed vector"); + if (SrcVT.is64BitVector()) + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, + DAG.getUNDEF(MVT::v8i8)); + TBLOperands.push_back(Src); + } + + SmallVector TBLMask; + for (unsigned i = 0; i < Mask.size(); i++) + TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); + assert((Mask.size() == 8 || Mask.size() == 16) && + "Expected a v8i8 or v16i8 Mask"); + TBLOperands.push_back( + DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); + + SDValue Shuffle = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, + Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); + return DAG.getBitcast(VT, Shuffle); + } + if (Sources.size() > 2) { - LLVM_DEBUG( - dbgs() << "Reshuffle failed: currently only do something sane when at " - "most two source vectors are involved\n"); + LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " + << "sensible when at most two source vectors are " + << "involved\n"); return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 33fa504..6e30267 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -3321,75 +3321,63 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v0.d[1] +; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: mov w8, #127 -; CHECK-NEXT: fcvtzs w11, d0 -; CHECK-NEXT: mov w9, #-128 -; CHECK-NEXT: mov d0, v2.d[1] -; CHECK-NEXT: fcvtzs w13, d1 -; CHECK-NEXT: fcvtzs w15, d3 -; CHECK-NEXT: fcvtzs w10, d4 -; CHECK-NEXT: mov d4, v1.d[1] -; CHECK-NEXT: mov d1, v3.d[1] -; CHECK-NEXT: fcvtzs w14, d0 +; CHECK-NEXT: fcvtzs w10, d3 +; CHECK-NEXT: mov w11, #-128 +; CHECK-NEXT: mov d3, v1.d[1] +; CHECK-NEXT: fcvtzs w13, d2 +; CHECK-NEXT: fcvtzs w15, d1 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzs w9, d4 +; CHECK-NEXT: mov d4, v2.d[1] +; CHECK-NEXT: fcvtzs w14, d3 +; CHECK-NEXT: cmp w9, #127 +; CHECK-NEXT: csel w9, w9, w8, lt +; CHECK-NEXT: fcvtzs w12, d4 +; CHECK-NEXT: cmn w9, #128 +; CHECK-NEXT: csel w9, w9, w11, gt ; CHECK-NEXT: cmp w10, #127 ; CHECK-NEXT: csel w10, w10, w8, lt -; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w11, #127 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: csel w11, w11, w9, gt +; CHECK-NEXT: csel w10, w10, w11, gt ; CHECK-NEXT: cmp w12, #127 ; CHECK-NEXT: csel w12, w12, w8, lt ; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w9, gt -; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: fmov s0, w11 -; CHECK-NEXT: csel w11, w13, w8, lt -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: fcvtzs w13, d2 -; CHECK-NEXT: csel w11, w11, w9, gt -; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: csel w10, w14, w8, lt -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: fmov s2, w11 -; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: csel w12, w12, w11, gt ; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: mov w11, v0.s[1] ; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: fmov s5, w10 ; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: fcvtzs w12, d1 -; CHECK-NEXT: csel w13, w13, w9, gt -; CHECK-NEXT: mov v0.b[1], w11 -; CHECK-NEXT: fmov w14, s2 -; CHECK-NEXT: cmp w12, #127 
-; CHECK-NEXT: fmov s1, w13 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: mov w11, v2.s[1] -; CHECK-NEXT: mov v0.b[2], w14 -; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: cmp w14, #127 +; CHECK-NEXT: csel w14, w14, w8, lt +; CHECK-NEXT: cmn w14, #128 +; CHECK-NEXT: csel w10, w14, w11, gt ; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: fcvtzs w14, d1 +; CHECK-NEXT: csel w15, w15, w8, lt +; CHECK-NEXT: cmn w15, #128 +; CHECK-NEXT: mov v5.s[1], w9 +; CHECK-NEXT: csel w9, w15, w11, gt +; CHECK-NEXT: cmp w14, #127 +; CHECK-NEXT: fcvtzs w15, d0 +; CHECK-NEXT: fmov s4, w13 +; CHECK-NEXT: csel w13, w14, w8, lt +; CHECK-NEXT: cmn w13, #128 +; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: cmp w15, #127 +; CHECK-NEXT: mov v4.s[1], w12 ; CHECK-NEXT: csel w8, w15, w8, lt +; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: cmn w8, #128 -; CHECK-NEXT: csel w8, w8, w9, gt -; CHECK-NEXT: mov v0.b[3], w11 -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: csel w8, w8, w11, gt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: mov v2.s[1], w12 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI82_0 +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0] +; CHECK-NEXT: tbl v0.8b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.8b ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) ret <8 x i8> %x @@ -3542,17 +3530,17 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) { define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v0.d[1] +; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: mov w8, #32767 -; CHECK-NEXT: fcvtzs w10, d0 +; CHECK-NEXT: fcvtzs w10, d3 ; CHECK-NEXT: mov w11, #-32768 -; CHECK-NEXT: mov d0, v2.d[1] -; CHECK-NEXT: fcvtzs w13, d1 -; CHECK-NEXT: fcvtzs w15, d3 +; CHECK-NEXT: mov d3, v1.d[1] +; CHECK-NEXT: fcvtzs w13, d2 +; CHECK-NEXT: fcvtzs w15, d1 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzs w9, d4 -; CHECK-NEXT: mov d4, v1.d[1] -; CHECK-NEXT: mov d1, v3.d[1] -; CHECK-NEXT: fcvtzs w14, d0 +; CHECK-NEXT: mov d4, v2.d[1] +; CHECK-NEXT: fcvtzs w14, d3 ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lt ; CHECK-NEXT: fcvtzs w12, d4 @@ -3567,49 +3555,38 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w12, w11, gt ; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: csel w10, w13, w8, lt -; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w13, d2 -; CHECK-NEXT: csel w10, w10, w11, gt -; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: csel w9, w14, w8, lt -; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: csel w9, w9, w11, gt -; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: mov w10, v0.s[1] ; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: fmov s5, w10 ; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w12, d1 ; CHECK-NEXT: csel w13, w13, w11, gt -; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: fmov w14, s2 -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: fmov s1, w13 -; CHECK-NEXT: csel w12, w12, 
w8, lt -; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NEXT: mov w10, v2.s[1] -; CHECK-NEXT: mov v0.h[2], w14 -; CHECK-NEXT: csel w12, w12, w11, gt +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w14, w14, w8, lt +; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w10, w14, w11, gt ; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: fcvtzs w14, d1 +; CHECK-NEXT: csel w15, w15, w8, lt +; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v5.s[1], w9 +; CHECK-NEXT: csel w9, w15, w11, gt +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: fcvtzs w15, d0 +; CHECK-NEXT: fmov s4, w13 +; CHECK-NEXT: csel w13, w14, w8, lt +; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: mov v4.s[1], w12 ; CHECK-NEXT: csel w8, w15, w8, lt +; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: mov v0.h[3], w10 -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.h[4], w9 -; CHECK-NEXT: mov v2.s[1], w12 -; CHECK-NEXT: mov v0.h[5], w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: mov v0.h[6], w8 -; CHECK-NEXT: mov v0.h[7], w9 +; CHECK-NEXT: adrp x8, .LCPI84_0 +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0] +; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.16b ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) ret <8 x i16> %x @@ -3618,140 +3595,116 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-LABEL: test_signed_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v0.d[1] +; CHECK-NEXT: mov d16, v3.d[1] ; CHECK-NEXT: mov w9, #32767 -; CHECK-NEXT: fcvtzs w11, d0 +; CHECK-NEXT: fcvtzs w11, d3 ; CHECK-NEXT: mov w8, #-32768 -; CHECK-NEXT: mov d0, v2.d[1] -; CHECK-NEXT: fcvtzs w12, d1 +; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w14, d2 -; CHECK-NEXT: mov d2, v4.d[1] +; CHECK-NEXT: fcvtzs w15, d1 +; CHECK-NEXT: mov d1, v7.d[1] ; CHECK-NEXT: fcvtzs w10, d16 -; CHECK-NEXT: mov d16, v1.d[1] -; CHECK-NEXT: mov d1, v3.d[1] -; CHECK-NEXT: fcvtzs w16, d3 -; CHECK-NEXT: fcvtzs w15, d0 -; CHECK-NEXT: mov d3, v6.d[1] +; CHECK-NEXT: mov d16, v2.d[1] +; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: fcvtzs w18, d0 +; CHECK-NEXT: mov d0, v6.d[1] +; CHECK-NEXT: fcvtzs w0, d7 ; CHECK-NEXT: cmp w10, w9 +; CHECK-NEXT: fcvtzs w2, d6 ; CHECK-NEXT: csel w10, w10, w9, lt -; CHECK-NEXT: fcvtzs w13, d16 +; CHECK-NEXT: fcvtzs w12, d16 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w17, d1 +; CHECK-NEXT: fcvtzs w17, d2 ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w11, w9 ; CHECK-NEXT: csel w11, w11, w9, lt -; CHECK-NEXT: mov d1, v5.d[1] +; CHECK-NEXT: fcvtzs w1, d0 +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: mov d0, v4.d[1] +; CHECK-NEXT: csel w13, w11, w8, gt +; CHECK-NEXT: cmp w12, w9 +; CHECK-NEXT: csel w11, w12, w9, lt +; CHECK-NEXT: fcvtzs w12, d3 ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w11, w11, w8, gt -; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: csel w13, w13, w9, lt -; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w8, gt +; CHECK-NEXT: cmp w14, w9 +; CHECK-NEXT: csel w14, w14, w9, lt +; CHECK-NEXT: fmov s19, w13 +; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w14, 
w14, w8, gt ; CHECK-NEXT: cmp w12, w9 ; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: fmov s0, w11 ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w12, w8, gt ; CHECK-NEXT: cmp w15, w9 ; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w11, w15, w8, gt -; CHECK-NEXT: cmp w14, w9 -; CHECK-NEXT: csel w14, w14, w9, lt -; CHECK-NEXT: fcvtzs w15, d4 -; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w8, gt +; CHECK-NEXT: csel w16, w15, w8, gt ; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: csel w10, w17, w9, lt -; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w17, d2 -; CHECK-NEXT: csel w10, w10, w8, gt -; CHECK-NEXT: cmp w16, w9 -; CHECK-NEXT: fmov s2, w12 -; CHECK-NEXT: csel w12, w16, w9, lt -; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NEXT: mov w16, v0.s[1] -; CHECK-NEXT: csel w12, w12, w8, gt -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: mov v2.s[1], w13 -; CHECK-NEXT: csel w13, w17, w9, lt -; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w15, w17, w9, lt ; CHECK-NEXT: fcvtzs w17, d1 -; CHECK-NEXT: csel w13, w13, w8, gt -; CHECK-NEXT: cmp w15, w9 -; CHECK-NEXT: csel w15, w15, w9, lt -; CHECK-NEXT: fmov s4, w14 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v0.h[1], w16 -; CHECK-NEXT: fcvtzs w16, d5 +; CHECK-NEXT: mov d1, v5.d[1] ; CHECK-NEXT: csel w15, w15, w8, gt +; CHECK-NEXT: cmp w18, w9 +; CHECK-NEXT: csel w18, w18, w9, lt +; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w18, w18, w8, gt ; CHECK-NEXT: cmp w17, w9 ; CHECK-NEXT: csel w17, w17, w9, lt ; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w17, w8, gt -; CHECK-NEXT: cmp w16, w9 -; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: csel w15, w16, w9, lt -; CHECK-NEXT: fcvtzs w16, d3 -; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v4.s[1], w11 -; CHECK-NEXT: csel w11, w15, w8, gt -; CHECK-NEXT: fcvtzs w15, d6 -; CHECK-NEXT: mov v1.s[1], w13 -; CHECK-NEXT: cmp w16, w9 -; CHECK-NEXT: fmov s3, w11 -; CHECK-NEXT: csel w16, w16, w9, lt -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: mov w13, v2.s[1] -; CHECK-NEXT: mov d2, v7.d[1] -; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w16, w16, w8, gt -; CHECK-NEXT: cmp w15, w9 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: csel w11, w15, w9, lt -; CHECK-NEXT: mov w15, v1.s[1] -; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v3.s[1], w14 -; CHECK-NEXT: fcvtzs w14, d2 -; CHECK-NEXT: csel w11, w11, w8, gt -; CHECK-NEXT: mov v0.h[3], w13 -; CHECK-NEXT: mov v1.h[1], w15 +; CHECK-NEXT: csel w17, w17, w8, gt +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: csel w0, w0, w9, lt +; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w13, w0, w8, gt +; CHECK-NEXT: cmp w1, w9 +; CHECK-NEXT: csel w1, w1, w9, lt +; CHECK-NEXT: fcvtzs w0, d1 +; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v19.s[1], w10 +; CHECK-NEXT: csel w10, w1, w8, gt +; CHECK-NEXT: cmp w2, w9 +; CHECK-NEXT: fcvtzs w1, d5 +; CHECK-NEXT: csel w2, w2, w9, lt +; CHECK-NEXT: fmov s18, w14 +; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768 +; CHECK-NEXT: fmov s23, w13 +; CHECK-NEXT: csel w2, w2, w8, gt +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: csel w14, w0, w9, lt +; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w13, w14, w8, gt +; CHECK-NEXT: cmp w1, w9 +; CHECK-NEXT: fcvtzs w14, d0 +; CHECK-NEXT: csel w0, w1, w9, lt +; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 +; 
CHECK-NEXT: mov v18.s[1], w11 +; CHECK-NEXT: csel w11, w0, w8, gt +; CHECK-NEXT: mov v23.s[1], w17 ; CHECK-NEXT: cmp w14, w9 -; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: fcvtzs w17, d4 ; CHECK-NEXT: csel w14, w14, w9, lt -; CHECK-NEXT: fcvtzs w15, d7 -; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: fmov s22, w2 ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: mov w11, v3.s[1] -; CHECK-NEXT: mov v1.h[2], w13 -; CHECK-NEXT: csel w13, w14, w8, gt -; CHECK-NEXT: cmp w15, w9 -; CHECK-NEXT: fmov s3, w12 -; CHECK-NEXT: mov v2.s[1], w16 -; CHECK-NEXT: csel w9, w15, w9, lt +; CHECK-NEXT: csel w14, w14, w8, gt +; CHECK-NEXT: fmov s17, w16 +; CHECK-NEXT: cmp w17, w9 +; CHECK-NEXT: mov v22.s[1], w10 +; CHECK-NEXT: csel w9, w17, w9, lt +; CHECK-NEXT: fmov s21, w11 ; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov w12, s4 ; CHECK-NEXT: csel w8, w9, w8, gt -; CHECK-NEXT: mov w14, v4.s[1] -; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v0.h[4], w12 -; CHECK-NEXT: mov v1.h[4], w11 -; CHECK-NEXT: mov v3.s[1], w10 -; CHECK-NEXT: mov v2.s[1], w13 -; CHECK-NEXT: mov v0.h[5], w14 -; CHECK-NEXT: mov v1.h[5], w9 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov w10, v3.s[1] -; CHECK-NEXT: mov w11, v2.s[1] -; CHECK-NEXT: mov v0.h[6], w8 -; CHECK-NEXT: mov v1.h[6], w9 -; CHECK-NEXT: mov v0.h[7], w10 -; CHECK-NEXT: mov v1.h[7], w11 +; CHECK-NEXT: adrp x9, .LCPI85_0 +; CHECK-NEXT: mov v17.s[1], w12 +; CHECK-NEXT: mov v21.s[1], w13 +; CHECK-NEXT: fmov s16, w18 +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0] +; CHECK-NEXT: fmov s20, w8 +; CHECK-NEXT: mov v16.s[1], w15 +; CHECK-NEXT: mov v20.s[1], w14 +; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b +; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index 99ffde4..35b7861 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2768,58 +2768,46 @@ define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) { define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_unsigned_v8f64_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d5, v0.d[1] -; CHECK-NEXT: fcvtzu w10, d0 -; CHECK-NEXT: mov d0, v1.d[1] +; CHECK-NEXT: mov d4, v3.d[1] +; CHECK-NEXT: fcvtzu w10, d3 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: fcvtzu w12, d1 -; CHECK-NEXT: mov d4, v2.d[1] -; CHECK-NEXT: fcvtzu w13, d3 -; CHECK-NEXT: fcvtzu w9, d5 -; CHECK-NEXT: fcvtzu w11, d0 +; CHECK-NEXT: fcvtzu w12, d2 +; CHECK-NEXT: fcvtzu w13, d1 +; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: mov d4, v1.d[1] +; CHECK-NEXT: fcvtzu w11, d3 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: csel w10, w11, w8, lo +; CHECK-NEXT: csel w11, w11, w8, lo ; CHECK-NEXT: cmp w12, #255 -; CHECK-NEXT: csel w11, w12, w8, lo -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fcvtzu w11, d2 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: fmov s19, w10 +; CHECK-NEXT: fcvtzu w10, d4 +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: 
mov v19.s[1], w9 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w13, #255 +; CHECK-NEXT: fmov s18, w12 +; CHECK-NEXT: fcvtzu w9, d1 +; CHECK-NEXT: csel w12, w13, w8, lo +; CHECK-NEXT: fcvtzu w13, d0 +; CHECK-NEXT: mov v18.s[1], w11 ; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: mov d2, v3.d[1] -; CHECK-NEXT: mov w12, v0.s[1] +; CHECK-NEXT: fmov s17, w12 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: csel w10, w11, w8, lo -; CHECK-NEXT: mov v0.b[1], w12 -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: fmov s4, w10 -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: mov w12, v1.s[1] -; CHECK-NEXT: mov v0.b[2], w11 -; CHECK-NEXT: mov v4.s[1], w9 -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: csel w9, w10, w8, lo ; CHECK-NEXT: cmp w13, #255 ; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: mov v0.b[3], w12 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov w8, v4.s[1] -; CHECK-NEXT: mov v0.b[4], w10 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w9 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov v17.s[1], w10 +; CHECK-NEXT: fmov s16, w8 +; CHECK-NEXT: adrp x8, .LCPI82_0 +; CHECK-NEXT: mov v16.s[1], w9 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0] +; CHECK-NEXT: tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.8b ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f) ret <8 x i8> %x @@ -2939,57 +2927,46 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_unsigned_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d5, v0.d[1] -; CHECK-NEXT: fcvtzu w10, d0 -; CHECK-NEXT: mov d0, v1.d[1] +; CHECK-NEXT: mov d4, v3.d[1] +; CHECK-NEXT: fcvtzu w10, d3 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: fcvtzu w12, d1 -; CHECK-NEXT: mov d4, v2.d[1] -; CHECK-NEXT: fcvtzu w13, d3 -; CHECK-NEXT: fcvtzu w9, d5 -; CHECK-NEXT: fcvtzu w11, d0 +; CHECK-NEXT: fcvtzu w12, d2 +; CHECK-NEXT: fcvtzu w13, d1 +; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: mov d4, v1.d[1] +; CHECK-NEXT: fcvtzu w11, d3 +; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, w8 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: csel w10, w11, w8, lo +; CHECK-NEXT: csel w11, w11, w8, lo ; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w11, w12, w8, lo -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fcvtzu w11, d2 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: fmov s19, w10 +; CHECK-NEXT: fcvtzu w10, d4 +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: mov v19.s[1], w9 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: fmov s18, w12 +; CHECK-NEXT: fcvtzu w9, d1 +; CHECK-NEXT: csel w12, w13, w8, lo +; CHECK-NEXT: fcvtzu w13, d0 +; CHECK-NEXT: mov v18.s[1], w11 ; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: mov d2, v3.d[1] -; CHECK-NEXT: mov w12, v0.s[1] +; CHECK-NEXT: fmov s17, w12 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: csel w10, w11, w8, lo -; CHECK-NEXT: mov v0.h[1], w12 -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: fmov s4, w10 -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: mov w12, v1.s[1] -; CHECK-NEXT: mov v0.h[2], w11 
-; CHECK-NEXT: mov v4.s[1], w9 -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: csel w9, w10, w8, lo ; CHECK-NEXT: cmp w13, w8 ; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: mov v0.h[3], w12 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov w8, v4.s[1] -; CHECK-NEXT: mov v0.h[4], w10 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: mov v0.h[5], w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mov v0.h[6], w8 -; CHECK-NEXT: mov v0.h[7], w9 +; CHECK-NEXT: mov v17.s[1], w10 +; CHECK-NEXT: fmov s16, w8 +; CHECK-NEXT: adrp x8, .LCPI84_0 +; CHECK-NEXT: mov v16.s[1], w9 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0] +; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f) ret <8 x i16> %x @@ -2998,107 +2975,83 @@ define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) { define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-LABEL: test_unsigned_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v0.d[1] -; CHECK-NEXT: fcvtzu w9, d0 -; CHECK-NEXT: mov d0, v1.d[1] -; CHECK-NEXT: mov d17, v2.d[1] -; CHECK-NEXT: fcvtzu w10, d1 -; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: mov d16, v3.d[1] +; CHECK-NEXT: fcvtzu w9, d3 +; CHECK-NEXT: mov d3, v2.d[1] ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: fcvtzu w12, d2 -; CHECK-NEXT: fcvtzu w11, d16 -; CHECK-NEXT: mov d2, v4.d[1] +; CHECK-NEXT: fcvtzu w10, d2 +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: fcvtzu w11, d1 +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzu w12, d16 ; CHECK-NEXT: fcvtzu w13, d0 -; CHECK-NEXT: fcvtzu w14, d17 -; CHECK-NEXT: fcvtzu w15, d1 -; CHECK-NEXT: fcvtzu w16, d3 -; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: mov d1, v5.d[1] -; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: fcvtzu w14, d3 +; CHECK-NEXT: mov d0, v7.d[1] +; CHECK-NEXT: fcvtzu w15, d2 +; CHECK-NEXT: fcvtzu w17, d6 +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: fcvtzu w16, d1 +; CHECK-NEXT: csel w12, w12, w8, lo ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: csel w13, w13, w8, lo -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w14, w8 ; CHECK-NEXT: csel w14, w14, w8, lo -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fcvtzu w17, d2 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fmov s19, w9 ; CHECK-NEXT: csel w9, w15, w8, lo -; CHECK-NEXT: fcvtzu w15, d4 -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: fcvtzu w18, d1 -; CHECK-NEXT: csel w16, w16, w8, lo -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: csel w17, w17, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: mov v0.s[1], w11 -; CHECK-NEXT: fcvtzu w0, d5 -; CHECK-NEXT: csel w11, w15, w8, lo -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: cmp w18, w8 -; CHECK-NEXT: mov d4, v6.d[1] -; CHECK-NEXT: csel w10, w18, w8, lo -; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: csel w11, w0, w8, lo -; CHECK-NEXT: mov v2.s[1], w13 -; CHECK-NEXT: mov w13, v0.s[1] -; CHECK-NEXT: fcvtzu w15, d4 -; CHECK-NEXT: mov v1.s[1], w17 -; CHECK-NEXT: fmov s3, w11 -; CHECK-NEXT: mov d4, v7.d[1] -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: fmov w11, s2 -; CHECK-NEXT: mov v3.s[1], w10 -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: mov w10, v1.s[1] -; CHECK-NEXT: mov w13, v2.s[1] -; CHECK-NEXT: fmov s2, w12 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: fcvtzu w11, d6 -; CHECK-NEXT: 
csel w12, w15, w8, lo -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: fmov w10, s3 ; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: fcvtzu w15, d0 +; CHECK-NEXT: mov d0, v6.d[1] ; CHECK-NEXT: csel w11, w11, w8, lo -; CHECK-NEXT: mov v0.h[3], w13 -; CHECK-NEXT: fcvtzu w13, d7 -; CHECK-NEXT: mov v1.h[2], w10 -; CHECK-NEXT: fmov s5, w11 -; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: mov w11, v3.s[1] -; CHECK-NEXT: mov v2.s[1], w14 -; CHECK-NEXT: fmov s3, w16 -; CHECK-NEXT: mov v5.s[1], w12 -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: mov v19.s[1], w12 +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fcvtzu w12, d7 +; CHECK-NEXT: fmov s18, w10 +; CHECK-NEXT: csel w10, w16, w8, lo ; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: fmov w12, s2 -; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: fmov w13, s5 -; CHECK-NEXT: mov w14, v2.s[1] -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov w11, v5.s[1] -; CHECK-NEXT: mov v0.h[4], w12 -; CHECK-NEXT: mov v1.h[4], w13 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: mov v2.s[1], w10 -; CHECK-NEXT: mov v0.h[5], w14 -; CHECK-NEXT: mov v1.h[5], w11 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov w10, v3.s[1] -; CHECK-NEXT: mov w11, v2.s[1] -; CHECK-NEXT: mov v0.h[6], w8 -; CHECK-NEXT: mov v1.h[6], w9 -; CHECK-NEXT: mov v0.h[7], w10 -; CHECK-NEXT: mov v1.h[7], w11 +; CHECK-NEXT: fcvtzu w16, d0 +; CHECK-NEXT: csel w13, w13, w8, lo +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: csel w15, w15, w8, lo +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: mov d0, v5.d[1] +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: mov v18.s[1], w14 +; CHECK-NEXT: fmov s23, w12 +; CHECK-NEXT: csel w12, w16, w8, lo +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: fcvtzu w16, d0 +; CHECK-NEXT: mov d0, v4.d[1] +; CHECK-NEXT: csel w14, w17, w8, lo +; CHECK-NEXT: fcvtzu w17, d5 +; CHECK-NEXT: fmov s17, w11 +; CHECK-NEXT: mov v23.s[1], w15 +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fmov s22, w14 +; CHECK-NEXT: csel w14, w16, w8, lo +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: fcvtzu w16, d0 +; CHECK-NEXT: csel w15, w17, w8, lo +; CHECK-NEXT: fcvtzu w11, d4 +; CHECK-NEXT: mov v22.s[1], w12 +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fmov s21, w15 +; CHECK-NEXT: csel w12, w16, w8, lo +; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: csel w8, w11, w8, lo +; CHECK-NEXT: mov v17.s[1], w9 +; CHECK-NEXT: adrp x9, .LCPI85_0 +; CHECK-NEXT: mov v21.s[1], w14 +; CHECK-NEXT: fmov s16, w13 +; CHECK-NEXT: fmov s20, w8 +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0] +; CHECK-NEXT: mov v16.s[1], w10 +; CHECK-NEXT: mov v20.s[1], w12 +; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b +; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll index dd7dd44..aaa7dd0 100644 --- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll @@ -268,36 +268,13 @@ entry: define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; CHECK-LABEL: extract_4_v4i32_badindex: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: mov w9, v1.s[2] -; 
CHECK-NEXT: mov v0.b[3], w10 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.b[5], w9 -; CHECK-NEXT: mov w9, v1.s[3] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v0.b[7], w9 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: mov w8, v2.s[2] -; CHECK-NEXT: mov v0.b[9], w9 -; CHECK-NEXT: mov w9, v2.s[3] -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov v0.b[11], w9 -; CHECK-NEXT: mov w9, v3.s[1] -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov w8, v3.s[2] -; CHECK-NEXT: mov v0.b[13], w9 -; CHECK-NEXT: mov w9, v3.s[3] -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w9 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret entry: %a0 = extractelement <4 x i32> %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll index fe49164..2389b74 100644 --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -1,46 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s +; CHECK: .LCPI0_0: +; CHECK: .byte 0 // 0x0 +; CHECK: .byte 16 // 0x10 +; CHECK: .byte 32 // 0x20 +; CHECK: .byte 48 // 0x30 +; CHECK: .byte 2 // 0x2 +; CHECK: .byte 18 // 0x12 +; CHECK: .byte 34 // 0x22 +; CHECK: .byte 50 // 0x32 +; CHECK: .byte 4 // 0x4 +; CHECK: .byte 20 // 0x14 +; CHECK: .byte 36 // 0x24 +; CHECK: .byte 52 // 0x34 +; CHECK: .byte 6 // 0x6 +; CHECK: .byte 22 // 0x16 +; CHECK: .byte 38 // 0x26 +; CHECK: .byte 54 // 0x36 define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; CHECK-LABEL: shuffle4_v4i8_16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: mov v4.b[1], w10 -; CHECK-NEXT: mov v4.b[2], w8 -; CHECK-NEXT: umov w8, v3.h[0] -; CHECK-NEXT: mov v4.b[3], w8 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov v4.b[4], w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v4.b[5], w8 -; CHECK-NEXT: umov w8, v2.h[1] -; CHECK-NEXT: mov v4.b[6], w8 -; CHECK-NEXT: umov w8, v3.h[1] -; CHECK-NEXT: mov v4.b[7], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: mov v4.b[8], w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v4.b[9], w8 -; CHECK-NEXT: umov w8, v2.h[2] -; CHECK-NEXT: mov v4.b[10], w8 -; CHECK-NEXT: umov w8, v3.h[2] -; CHECK-NEXT: mov v4.b[11], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: mov v4.b[12], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v4.b[13], w8 -; CHECK-NEXT: umov w8, v2.h[3] -; CHECK-NEXT: mov v4.b[14], w8 -; CHECK-NEXT: umov w8, v3.h[3] -; CHECK-NEXT: mov v4.b[15], w8 -; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: // kill: def $d3 killed $d3 killed 
$q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> @@ -48,30 +35,25 @@ define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i ret <16 x i8> %z } +; CHECK: .LCPI1_0: +; CHECK: .byte 0 // 0x0 +; CHECK: .byte 16 // 0x10 +; CHECK: .byte 32 // 0x20 +; CHECK: .byte 48 // 0x30 +; CHECK: .byte 2 // 0x2 +; CHECK: .byte 18 // 0x12 +; CHECK: .byte 34 // 0x22 +; CHECK: .byte 50 // 0x32 define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; CHECK-LABEL: shuffle4_v4i8_8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: umov w9, v3.h[0] -; CHECK-NEXT: mov v4.b[1], w10 -; CHECK-NEXT: mov v4.b[2], w8 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov v4.b[3], w9 -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: mov v4.b[4], w8 -; CHECK-NEXT: umov w8, v2.h[1] -; CHECK-NEXT: mov v4.b[5], w9 -; CHECK-NEXT: umov w9, v3.h[1] -; CHECK-NEXT: mov v4.b[6], w8 -; CHECK-NEXT: mov v4.b[7], w9 -; CHECK-NEXT: fmov d0, d4 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> @@ -388,46 +370,33 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x ret <8 x i16> %z } +; CHECK: .LCPI9_0: +; CHECK: .byte 0 // 0x0 +; CHECK: .byte 16 // 0x10 +; CHECK: .byte 32 // 0x20 +; CHECK: .byte 48 // 0x30 +; CHECK: .byte 2 // 0x2 +; CHECK: .byte 18 // 0x12 +; CHECK: .byte 34 // 0x22 +; CHECK: .byte 50 // 0x32 +; CHECK: .byte 4 // 0x4 +; CHECK: .byte 20 // 0x14 +; CHECK: .byte 36 // 0x24 +; CHECK: .byte 52 // 0x34 +; CHECK: .byte 6 // 0x6 +; CHECK: .byte 22 // 0x16 +; CHECK: .byte 38 // 0x26 +; CHECK: .byte 54 // 0x36 define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) { ; CHECK-LABEL: shuffle4_v4i16_trunc: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: mov v4.b[1], w10 -; CHECK-NEXT: mov v4.b[2], w8 -; CHECK-NEXT: umov w8, v3.h[0] -; CHECK-NEXT: mov v4.b[3], w8 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: 
mov v4.b[4], w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v4.b[5], w8 -; CHECK-NEXT: umov w8, v2.h[1] -; CHECK-NEXT: mov v4.b[6], w8 -; CHECK-NEXT: umov w8, v3.h[1] -; CHECK-NEXT: mov v4.b[7], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: mov v4.b[8], w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v4.b[9], w8 -; CHECK-NEXT: umov w8, v2.h[2] -; CHECK-NEXT: mov v4.b[10], w8 -; CHECK-NEXT: umov w8, v3.h[2] -; CHECK-NEXT: mov v4.b[11], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: mov v4.b[12], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v4.b[13], w8 -; CHECK-NEXT: umov w8, v2.h[3] -; CHECK-NEXT: mov v4.b[14], w8 -; CHECK-NEXT: umov w8, v3.h[3] -; CHECK-NEXT: mov v4.b[15], w8 -; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret %a = trunc <4 x i16> %ae to <4 x i8> %b = trunc <4 x i16> %be to <4 x i8> @@ -439,45 +408,34 @@ define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> % ret <16 x i8> %z } +; CHECK: .LCPI10_0: +; CHECK: .byte 0 // 0x0 +; CHECK: .byte 16 // 0x10 +; CHECK: .byte 32 // 0x20 +; CHECK: .byte 48 // 0x30 +; CHECK: .byte 2 // 0x2 +; CHECK: .byte 18 // 0x12 +; CHECK: .byte 34 // 0x22 +; CHECK: .byte 50 // 0x32 +; CHECK: .byte 4 // 0x4 +; CHECK: .byte 20 // 0x14 +; CHECK: .byte 36 // 0x24 +; CHECK: .byte 52 // 0x34 +; CHECK: .byte 6 // 0x6 +; CHECK: .byte 22 // 0x16 +; CHECK: .byte 38 // 0x26 +; CHECK: .byte 54 // 0x36 +; CHECK: .text define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) { ; CHECK-LABEL: shuffle4_v4i32_trunc: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: xtn v4.4h, v0.4s -; CHECK-NEXT: xtn v1.4h, v1.4s -; CHECK-NEXT: xtn v2.4h, v2.4s -; CHECK-NEXT: xtn v3.4h, v3.4s -; CHECK-NEXT: umov w8, v4.h[0] -; CHECK-NEXT: umov w9, v1.h[0] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: mov v0.b[1], w9 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: umov w8, v3.h[0] -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v4.h[1] -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v2.h[1] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: umov w8, v3.h[1] -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: umov w8, v4.h[2] -; CHECK-NEXT: mov v0.b[8], w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: umov w8, v2.h[2] -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: umov w8, v3.h[2] -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: umov w8, v4.h[3] -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: umov w8, v2.h[3] -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: umov w8, v3.h[3] -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: xtn v5.4h, v1.4s +; CHECK-NEXT: xtn v6.4h, v2.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: xtn v7.4h, v3.4s +; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b ; CHECK-NEXT: ret %a = trunc <4 x i32> %ae to <4 x i8> %b = trunc <4 x i32> %be to <4 x i8> @@ -489,37 +447,32 @@ define <16 
x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> % ret <16 x i8> %z } +; CHECK: .LCPI11_0: +; CHECK: .byte 0 // 0x0 +; CHECK: .byte 16 // 0x10 +; CHECK: .byte 32 // 0x20 +; CHECK: .byte 2 // 0x2 +; CHECK: .byte 18 // 0x12 +; CHECK: .byte 34 // 0x22 +; CHECK: .byte 4 // 0x4 +; CHECK: .byte 20 // 0x14 +; CHECK: .byte 36 // 0x24 +; CHECK: .byte 6 // 0x6 +; CHECK: .byte 22 // 0x16 +; CHECK: .byte 38 // 0x26 +; CHECK: .byte 255 // 0xff +; CHECK: .byte 255 // 0xff +; CHECK: .byte 255 // 0xff +; CHECK: .byte 255 // 0xff define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) { ; CHECK-LABEL: shuffle3_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: umov w9, v1.h[0] -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: mov v3.b[1], w9 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: mov v3.b[2], w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v3.b[3], w9 -; CHECK-NEXT: umov w9, v2.h[1] -; CHECK-NEXT: mov v3.b[4], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: mov v3.b[5], w9 -; CHECK-NEXT: umov w9, v1.h[2] -; CHECK-NEXT: mov v3.b[6], w8 -; CHECK-NEXT: umov w8, v2.h[2] -; CHECK-NEXT: mov v3.b[7], w9 -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: mov v3.b[8], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v3.b[9], w9 -; CHECK-NEXT: umov w9, v2.h[3] -; CHECK-NEXT: mov v3.b[10], w8 -; CHECK-NEXT: mov v3.b[11], w9 -; CHECK-NEXT: mov v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> @@ -743,3 +696,83 @@ define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8 %i16 = insertelement <16 x i8> %i15, i8 %e16, i32 15 ret <16 x i8> %i16 } + + +; CHECK: .LCPI16_0: +; CHECK: .byte 0 +; CHECK: .byte 1 +; CHECK: .byte 4 +; CHECK: .byte 5 +; CHECK: .byte 16 +; CHECK: .byte 17 +; CHECK: .byte 20 +; CHECK: .byte 21 +; CHECK: .byte 32 +; CHECK: .byte 33 +; CHECK: .byte 36 +; CHECK: .byte 37 +; CHECK: .byte 48 +; CHECK: .byte 49 +; CHECK: .byte 52 +; CHECK: .byte 53 +define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l249, <2 x double> %l267, <2 x double> %l285, <2 x double> %l303, <2 x double> %l321, <2 x double> %l339) { +; CHECK-LABEL: test: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm v0.2d, v0.2d +; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: frintm v4.2d, v4.2d +; CHECK-NEXT: frintm v1.2d, v1.2d +; CHECK-NEXT: frintm v5.2d, v5.2d +; CHECK-NEXT: frintm v2.2d, v2.2d +; CHECK-NEXT: frintm v6.2d, v6.2d +; CHECK-NEXT: frintm v3.2d, v3.2d +; CHECK-NEXT: frintm v7.2d, v7.2d +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: fcvtzs v4.2d, v4.2d +; CHECK-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-NEXT: fcvtzs v5.2d, v5.2d +; CHECK-NEXT: fcvtzs v2.2d, v2.2d +; CHECK-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-NEXT: fcvtzs v3.2d, v3.2d +; CHECK-NEXT: xtn v16.2s, v0.2d +; CHECK-NEXT: fcvtzs v0.2d, v7.2d +; CHECK-NEXT: xtn v20.2s, v4.2d +; CHECK-NEXT: xtn v17.2s, v1.2d +; CHECK-NEXT: xtn v21.2s, v5.2d +; 
CHECK-NEXT: xtn v18.2s, v2.2d +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: xtn v22.2s, v6.2d +; CHECK-NEXT: xtn v19.2s, v3.2d +; CHECK-NEXT: xtn v23.2s, v0.2d +; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b +; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: ret + %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213) + %l215 = fptosi <2 x double> %l214 to <2 x i16> + %l232 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l231) + %l233 = fptosi <2 x double> %l232 to <2 x i16> + %l250 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l249) + %l251 = fptosi <2 x double> %l250 to <2 x i16> + %l268 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l267) + %l269 = fptosi <2 x double> %l268 to <2 x i16> + %l286 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l285) + %l287 = fptosi <2 x double> %l286 to <2 x i16> + %l304 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l303) + %l305 = fptosi <2 x double> %l304 to <2 x i16> + %l322 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l321) + %l323 = fptosi <2 x double> %l322 to <2 x i16> + %l340 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l339) + %l341 = fptosi <2 x double> %l340 to <2 x i16> + %l342 = shufflevector <2 x i16> %l215, <2 x i16> %l233, <4 x i32> + %l343 = shufflevector <2 x i16> %l251, <2 x i16> %l269, <4 x i32> + %l344 = shufflevector <2 x i16> %l287, <2 x i16> %l305, <4 x i32> + %l345 = shufflevector <2 x i16> %l323, <2 x i16> %l341, <4 x i32> + %l346 = shufflevector <4 x i16> %l342, <4 x i16> %l343, <8 x i32> + %l347 = shufflevector <4 x i16> %l344, <4 x i16> %l345, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %l346, <8 x i16> %l347, <16 x i32> + ret <16 x i16> %interleaved.vec +} + +declare <2 x double> @llvm.floor.v2f64(<2 x double> %l213) diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index d9b610d..946128c 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -371,59 +371,38 @@ define void @loop3(i8* noalias nocapture noundef writeonly %dst, float* nocaptur ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_8: // %vector.ph ; CHECK-NEXT: add x11, x8, #1 -; CHECK-NEXT: mov w13, #1132396544 +; CHECK-NEXT: adrp x12, .LCPI2_0 ; CHECK-NEXT: and x10, x11, #0x1fffffffc +; CHECK-NEXT: mov w13, #1132396544 +; CHECK-NEXT: add x8, x10, x10, lsl #1 +; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI2_0] +; CHECK-NEXT: add x9, x0, x8 ; CHECK-NEXT: mov x12, x10 -; CHECK-NEXT: add x9, x10, x10, lsl #1 -; CHECK-NEXT: dup v0.4s, w13 -; CHECK-NEXT: add x8, x1, x9, lsl #2 -; CHECK-NEXT: add x9, x0, x9 +; CHECK-NEXT: add x8, x1, x8, lsl #2 +; CHECK-NEXT: dup v1.4s, w13 ; CHECK-NEXT: .LBB2_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld3 { v1.4s, v2.4s, v3.4s }, [x1], #48 -; CHECK-NEXT: fcmlt v4.4s, v1.4s, #0.0 +; CHECK-NEXT: ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48 +; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0 +; CHECK-NEXT: add x13, x0, #8 +; CHECK-NEXT: fmin v6.4s, v2.4s, v1.4s ; CHECK-NEXT: subs x12, x12, #4 -; CHECK-NEXT: fmin v5.4s, v1.4s, v0.4s -; CHECK-NEXT: fmin v6.4s, v2.4s, v0.4s ; CHECK-NEXT: fcmlt v7.4s, v3.4s, #0.0 -; CHECK-NEXT: fmin v1.4s, v3.4s, v0.4s -; CHECK-NEXT: bic v4.16b, v5.16b, v4.16b -; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0 -; CHECK-NEXT: fcvtzs v4.4s, v4.4s -; CHECK-NEXT: bic v1.16b, 
v1.16b, v7.16b -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: bic v2.16b, v6.16b, v5.16b +; CHECK-NEXT: fmin v16.4s, v3.4s, v1.4s +; CHECK-NEXT: fmin v2.4s, v4.4s, v1.4s +; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b +; CHECK-NEXT: fcmlt v6.4s, v4.4s, #0.0 +; CHECK-NEXT: bic v3.16b, v16.16b, v7.16b +; CHECK-NEXT: fcvtzs v4.4s, v5.4s +; CHECK-NEXT: fcvtzs v3.4s, v3.4s +; CHECK-NEXT: bic v2.16b, v2.16b, v6.16b ; CHECK-NEXT: fcvtzs v2.4s, v2.4s -; CHECK-NEXT: xtn v3.4h, v4.4s -; CHECK-NEXT: xtn v1.4h, v1.4s -; CHECK-NEXT: umov w13, v3.h[0] -; CHECK-NEXT: xtn v2.4h, v2.4s -; CHECK-NEXT: umov w14, v2.h[0] -; CHECK-NEXT: fmov s4, w13 -; CHECK-NEXT: umov w13, v1.h[0] -; CHECK-NEXT: mov v4.b[1], w14 -; CHECK-NEXT: umov w14, v3.h[1] -; CHECK-NEXT: mov v4.b[2], w13 -; CHECK-NEXT: umov w13, v2.h[1] -; CHECK-NEXT: mov v4.b[3], w14 -; CHECK-NEXT: umov w14, v1.h[1] -; CHECK-NEXT: mov v4.b[4], w13 -; CHECK-NEXT: umov w13, v3.h[2] -; CHECK-NEXT: mov v4.b[5], w14 -; CHECK-NEXT: umov w14, v2.h[2] -; CHECK-NEXT: mov v4.b[6], w13 -; CHECK-NEXT: umov w13, v1.h[2] -; CHECK-NEXT: mov v4.b[7], w14 -; CHECK-NEXT: umov w14, v3.h[3] -; CHECK-NEXT: mov v4.b[8], w13 -; CHECK-NEXT: umov w13, v2.h[3] -; CHECK-NEXT: mov v4.b[9], w14 -; CHECK-NEXT: umov w14, v1.h[3] -; CHECK-NEXT: mov v4.b[10], w13 -; CHECK-NEXT: add x13, x0, #8 -; CHECK-NEXT: mov v4.b[11], w14 -; CHECK-NEXT: str d4, [x0], #12 -; CHECK-NEXT: st1 { v4.s }[2], [x13] +; CHECK-NEXT: xtn v4.4h, v4.4s +; CHECK-NEXT: xtn v5.4h, v3.4s +; CHECK-NEXT: xtn v6.4h, v2.4s +; CHECK-NEXT: tbl v2.16b, { v4.16b, v5.16b, v6.16b }, v0.16b +; CHECK-NEXT: str d2, [x0], #12 +; CHECK-NEXT: st1 { v2.s }[2], [x13] ; CHECK-NEXT: b.ne .LBB2_9 ; CHECK-NEXT: // %bb.10: // %middle.block ; CHECK-NEXT: cmp x11, x10 @@ -606,69 +585,40 @@ define void @loop4(i8* noalias nocapture noundef writeonly %dst, float* nocaptur ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_8: // %vector.ph ; CHECK-NEXT: add x11, x8, #1 -; CHECK-NEXT: mov w13, #1132396544 +; CHECK-NEXT: adrp x12, .LCPI3_0 ; CHECK-NEXT: and x10, x11, #0x1fffffffc -; CHECK-NEXT: mov x12, x10 +; CHECK-NEXT: mov w13, #1132396544 ; CHECK-NEXT: add x8, x1, x10, lsl #4 ; CHECK-NEXT: add x9, x0, x10, lsl #2 -; CHECK-NEXT: dup v0.4s, w13 +; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI3_0] +; CHECK-NEXT: mov x12, x10 +; CHECK-NEXT: dup v1.4s, w13 ; CHECK-NEXT: .LBB3_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x1], #64 -; CHECK-NEXT: fcmlt v5.4s, v1.4s, #0.0 +; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64 +; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0 ; CHECK-NEXT: subs x12, x12, #4 -; CHECK-NEXT: fmin v6.4s, v1.4s, v0.4s -; CHECK-NEXT: fmin v7.4s, v2.4s, v0.4s +; CHECK-NEXT: fmin v7.4s, v2.4s, v1.4s ; CHECK-NEXT: fcmlt v16.4s, v3.4s, #0.0 -; CHECK-NEXT: fmin v17.4s, v3.4s, v0.4s -; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b -; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0 -; CHECK-NEXT: fcvtzs v5.4s, v5.4s -; CHECK-NEXT: fmin v1.4s, v4.4s, v0.4s +; CHECK-NEXT: fmin v17.4s, v3.4s, v1.4s +; CHECK-NEXT: fmin v18.4s, v4.4s, v1.4s ; CHECK-NEXT: bic v6.16b, v7.16b, v6.16b -; CHECK-NEXT: fcvtzs v6.4s, v6.4s -; CHECK-NEXT: xtn v5.4h, v5.4s -; CHECK-NEXT: bic v7.16b, v17.16b, v16.16b -; CHECK-NEXT: fcmlt v16.4s, v4.4s, #0.0 -; CHECK-NEXT: umov w13, v5.h[0] -; CHECK-NEXT: xtn v2.4h, v6.4s -; CHECK-NEXT: fcvtzs v3.4s, v7.4s -; CHECK-NEXT: umov w14, v2.h[0] -; CHECK-NEXT: bic v1.16b, v1.16b, v16.16b -; CHECK-NEXT: fmov s4, w13 -; CHECK-NEXT: xtn v3.4h, v3.4s -; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; 
CHECK-NEXT: mov v4.b[1], w14 -; CHECK-NEXT: umov w13, v3.h[0] -; CHECK-NEXT: xtn v1.4h, v1.4s -; CHECK-NEXT: mov v4.b[2], w13 -; CHECK-NEXT: umov w13, v1.h[0] -; CHECK-NEXT: mov v4.b[3], w13 -; CHECK-NEXT: umov w13, v5.h[1] -; CHECK-NEXT: mov v4.b[4], w13 -; CHECK-NEXT: umov w13, v2.h[1] -; CHECK-NEXT: mov v4.b[5], w13 -; CHECK-NEXT: umov w13, v3.h[1] -; CHECK-NEXT: mov v4.b[6], w13 -; CHECK-NEXT: umov w13, v1.h[1] -; CHECK-NEXT: mov v4.b[7], w13 -; CHECK-NEXT: umov w13, v5.h[2] -; CHECK-NEXT: mov v4.b[8], w13 -; CHECK-NEXT: umov w13, v2.h[2] -; CHECK-NEXT: mov v4.b[9], w13 -; CHECK-NEXT: umov w13, v3.h[2] -; CHECK-NEXT: mov v4.b[10], w13 -; CHECK-NEXT: umov w13, v1.h[2] -; CHECK-NEXT: mov v4.b[11], w13 -; CHECK-NEXT: umov w13, v5.h[3] -; CHECK-NEXT: mov v4.b[12], w13 -; CHECK-NEXT: umov w13, v2.h[3] -; CHECK-NEXT: mov v4.b[13], w13 -; CHECK-NEXT: umov w13, v3.h[3] -; CHECK-NEXT: mov v4.b[14], w13 -; CHECK-NEXT: umov w13, v1.h[3] -; CHECK-NEXT: mov v4.b[15], w13 -; CHECK-NEXT: str q4, [x0], #16 +; CHECK-NEXT: fcmlt v7.4s, v4.4s, #0.0 +; CHECK-NEXT: bic v16.16b, v17.16b, v16.16b +; CHECK-NEXT: fcmlt v17.4s, v5.4s, #0.0 +; CHECK-NEXT: fmin v2.4s, v5.4s, v1.4s +; CHECK-NEXT: fcvtzs v4.4s, v6.4s +; CHECK-NEXT: bic v3.16b, v18.16b, v7.16b +; CHECK-NEXT: fcvtzs v5.4s, v16.4s +; CHECK-NEXT: fcvtzs v3.4s, v3.4s +; CHECK-NEXT: bic v2.16b, v2.16b, v17.16b +; CHECK-NEXT: fcvtzs v2.4s, v2.4s +; CHECK-NEXT: xtn v16.4h, v4.4s +; CHECK-NEXT: xtn v17.4h, v5.4s +; CHECK-NEXT: xtn v18.4h, v3.4s +; CHECK-NEXT: xtn v19.4h, v2.4s +; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; CHECK-NEXT: str q2, [x0], #16 ; CHECK-NEXT: b.ne .LBB3_9 ; CHECK-NEXT: // %bb.10: // %middle.block ; CHECK-NEXT: cmp x11, x10 -- 2.7.4
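
Note, appended for this write-up and not part of the patch itself: the heart of the change is the byte-index arithmetic that turns a 3- or 4-source build vector into a single TBL mask. The standalone C++ sketch below reproduces that arithmetic on made-up inputs (two sources with i16 elements and one with i32 elements feeding a v8i16 result); the source shapes, lane picks, and struct names are assumptions for illustration only, and just the formula itself, OutputFactor mask bytes per output element starting at byte 16 * S + Lane * InputSize / 8 of the concatenated 16-byte TBL sources, is taken from the code change above.

  // Standalone sketch, not LLVM code: mirrors how the patch derives the byte
  // mask for tbl3/tbl4 from the (source, lane) pair behind each operand.
  #include <cstdio>
  #include <vector>

  struct SourceInfo {
    unsigned ScalarSizeInBits; // element width of this source vector
  };

  struct Pick {
    int Source;    // which source the output element comes from, -1 = undef
    unsigned Lane; // lane within that source
  };

  int main() {
    // Hypothetical build vector: 8 output elements, 16 bits each.
    const unsigned NumElts = 8;
    const unsigned OutputFactor = 16 / 8; // mask bytes per output element

    // Three hypothetical sources: two with i16 elements, one with i32.
    std::vector<SourceInfo> Sources = {{16}, {16}, {32}};

    // Where each output element comes from; the last two are undef.
    std::vector<Pick> Picks = {{0, 0}, {0, 1}, {1, 0}, {1, 1},
                               {2, 0}, {2, 1}, {-1, 0}, {-1, 0}};

    std::vector<int> Mask;
    for (unsigned I = 0; I < NumElts; ++I) {
      if (Picks[I].Source < 0) {
        // Undef element: out-of-range TBL indices read as zero, which is fine.
        for (unsigned OF = 0; OF < OutputFactor; OF++)
          Mask.push_back(-1);
        continue;
      }
      unsigned S = Picks[I].Source;
      unsigned InputSize = Sources[S].ScalarSizeInBits;
      // Source S occupies bytes [16*S, 16*S+16) of the concatenated TBL table.
      unsigned InputBase = 16 * S + Picks[I].Lane * InputSize / 8;
      for (unsigned OF = 0; OF < OutputFactor; OF++)
        Mask.push_back(InputBase + OF);
    }

    // An 8 x i16 result needs 16 mask bytes, i.e. a v16i8 mask operand for a
    // tbl3 over the three concatenated sources.
    for (int M : Mask)
      printf("%d ", M);
    printf("\n");
    return 0;
  }

Compiled and run, this should print 0 1 2 3 16 17 18 19 32 33 36 37 -1 -1 -1 -1, a mask of the same shape as the .LCPI constants checked in shuffle-tbl34.ll above: bytes below 16 select from the first source register, 16 to 31 from the second, 32 to 47 from the third, and the out-of-range indices for undef lanes yield zero.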