From: David Green Date: Fri, 25 Mar 2022 10:03:10 +0000 (+0000) Subject: Revert "[AArch64] Lower 3 and 4 sources buildvectors to TBL" X-Git-Tag: upstream/15.0.7~12350 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3d8d60e147fdf345eb014bf45c71440a9e9356f9;p=platform%2Fupstream%2Fllvm.git Revert "[AArch64] Lower 3 and 4 sources buildvectors to TBL" This reverts commit ec93b28909749619dbe58b092a13da9d1ff1eb1e as problems with it have been reported. --- diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c9cd9c6..f2c51e1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9079,72 +9079,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, Source->MaxElt = std::max(Source->MaxElt, EltNo); } - // If we have 3 or 4 sources, try to generate a TBL, which will at least be - // better than moving to/from gpr registers for larger vectors. - if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { - // Construct a mask for the tbl. We may need to adjust the index for types - // larger than i8. - SmallVector Mask; - unsigned OutputFactor = VT.getScalarSizeInBits() / 8; - for (unsigned I = 0; I < NumElts; ++I) { - SDValue V = Op.getOperand(I); - if (V.isUndef()) { - for (unsigned OF = 0; OF < OutputFactor; OF++) - Mask.push_back(-1); - continue; - } - // Set the Mask lanes adjusted for the size of the input and output - // lanes. The Mask is always i8, so it will set OutputFactor lanes per - // output element, adjusted in their positions per input and output types. - unsigned Lane = V.getConstantOperandVal(1); - for (unsigned S = 0; S < Sources.size(); S++) { - if (V.getOperand(0) == Sources[S].Vec) { - unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); - unsigned InputBase = 16 * S + Lane * InputSize / (8 * OutputFactor); - for (unsigned OF = 0; OF < OutputFactor; OF++) - Mask.push_back(InputBase + OF); - break; - } - } - } - - // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to - // v16i8, and the TBLMask - SmallVector TBLOperands; - TBLOperands.push_back(DAG.getConstant(Sources.size() == 3 - ? Intrinsic::aarch64_neon_tbl3 - : Intrinsic::aarch64_neon_tbl4, - dl, MVT::i32)); - for (unsigned i = 0; i < Sources.size(); i++) { - SDValue Src = Sources[i].Vec; - EVT SrcVT = Src.getValueType(); - Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src); - assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && - "Expected a legally typed vector"); - if (SrcVT.is64BitVector()) - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, - DAG.getUNDEF(MVT::v8i8)); - TBLOperands.push_back(Src); - } - - SmallVector TBLMask; - for (unsigned i = 0; i < Mask.size(); i++) - TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); - assert((Mask.size() == 8 || Mask.size() == 16) && - "Expected a v8i8 or v16i8 Mask"); - TBLOperands.push_back( - DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); - - SDValue Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, - Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); - return DAG.getBitcast(VT, Shuffle); - } - if (Sources.size() > 2) { - LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " - << "sensible when at most two source vectors are " - << "involved\n"); + LLVM_DEBUG( + dbgs() << "Reshuffle failed: currently only do something sane when at " + "most two source vectors are involved\n"); return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 6e30267..33fa504 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -3321,63 +3321,75 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v3.d[1] +; CHECK-NEXT: mov d4, v0.d[1] ; CHECK-NEXT: mov w8, #127 -; CHECK-NEXT: fcvtzs w10, d3 -; CHECK-NEXT: mov w11, #-128 -; CHECK-NEXT: mov d3, v1.d[1] -; CHECK-NEXT: fcvtzs w13, d2 -; CHECK-NEXT: fcvtzs w15, d1 -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzs w9, d4 -; CHECK-NEXT: mov d4, v2.d[1] -; CHECK-NEXT: fcvtzs w14, d3 -; CHECK-NEXT: cmp w9, #127 -; CHECK-NEXT: csel w9, w9, w8, lt -; CHECK-NEXT: fcvtzs w12, d4 -; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: fcvtzs w11, d0 +; CHECK-NEXT: mov w9, #-128 +; CHECK-NEXT: mov d0, v2.d[1] +; CHECK-NEXT: fcvtzs w13, d1 +; CHECK-NEXT: fcvtzs w15, d3 +; CHECK-NEXT: fcvtzs w10, d4 +; CHECK-NEXT: mov d4, v1.d[1] +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fcvtzs w14, d0 ; CHECK-NEXT: cmp w10, #127 ; CHECK-NEXT: csel w10, w10, w8, lt +; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: csel w10, w10, w11, gt +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w11, w11, w9, gt ; CHECK-NEXT: cmp w12, #127 ; CHECK-NEXT: csel w12, w12, w8, lt ; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w11, gt +; CHECK-NEXT: csel w12, w12, w9, gt ; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: fmov s5, w10 -; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w13, w13, w11, gt -; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: csel w14, w14, w8, lt -; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w10, w14, w11, gt -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: fcvtzs w14, d1 -; CHECK-NEXT: csel w15, w15, w8, lt -; CHECK-NEXT: cmn w15, #128 -; CHECK-NEXT: mov v5.s[1], w9 -; CHECK-NEXT: csel w9, w15, w11, gt +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: csel w11, w13, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: fcvtzs w13, d2 +; CHECK-NEXT: csel w11, w11, w9, gt ; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: fcvtzs w15, d0 -; CHECK-NEXT: fmov s4, w13 -; CHECK-NEXT: csel w13, w14, w8, lt +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: csel w10, w14, w8, lt +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w13, #127 +; CHECK-NEXT: mov w11, v0.s[1] +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: fcvtzs w12, d1 +; CHECK-NEXT: csel w13, w13, w9, gt +; CHECK-NEXT: mov v0.b[1], w11 +; CHECK-NEXT: fmov w14, s2 +; CHECK-NEXT: cmp w12, #127 +; CHECK-NEXT: fmov s1, w13 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: cmn w12, #128 +; CHECK-NEXT: mov w11, v2.s[1] +; CHECK-NEXT: mov v0.b[2], w14 +; CHECK-NEXT: csel w12, w12, w9, gt ; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: mov v4.s[1], w12 +; CHECK-NEXT: mov v1.s[1], w10 ; CHECK-NEXT: csel w8, w15, w8, lt -; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: cmn w8, #128 -; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: csel w8, w8, w9, gt +; CHECK-NEXT: mov v0.b[3], w11 +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: adrp x8, .LCPI82_0 -; CHECK-NEXT: mov v2.s[1], w13 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0] -; CHECK-NEXT: tbl v0.8b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.8b +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.b[4], w9 +; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: mov v0.b[6], w8 +; CHECK-NEXT: mov v0.b[7], w9 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) ret <8 x i8> %x @@ -3530,17 +3542,17 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) { define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v3.d[1] +; CHECK-NEXT: mov d4, v0.d[1] ; CHECK-NEXT: mov w8, #32767 -; CHECK-NEXT: fcvtzs w10, d3 +; CHECK-NEXT: fcvtzs w10, d0 ; CHECK-NEXT: mov w11, #-32768 -; CHECK-NEXT: mov d3, v1.d[1] -; CHECK-NEXT: fcvtzs w13, d2 -; CHECK-NEXT: fcvtzs w15, d1 -; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: mov d0, v2.d[1] +; CHECK-NEXT: fcvtzs w13, d1 +; CHECK-NEXT: fcvtzs w15, d3 ; CHECK-NEXT: fcvtzs w9, d4 -; CHECK-NEXT: mov d4, v2.d[1] -; CHECK-NEXT: fcvtzs w14, d3 +; CHECK-NEXT: mov d4, v1.d[1] +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fcvtzs w14, d0 ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lt ; CHECK-NEXT: fcvtzs w12, d4 @@ -3555,38 +3567,49 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w12, w11, gt ; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: fmov s5, w10 -; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w11, gt -; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: csel w14, w14, w8, lt -; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w10, w14, w11, gt -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fcvtzs w14, d1 -; CHECK-NEXT: csel w15, w15, w8, lt -; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v5.s[1], w9 -; CHECK-NEXT: csel w9, w15, w11, gt +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: csel w10, w13, w8, lt +; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w13, d2 +; CHECK-NEXT: csel w10, w10, w11, gt ; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: fcvtzs w15, d0 -; CHECK-NEXT: fmov s4, w13 -; CHECK-NEXT: csel w13, w14, w8, lt +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: csel w9, w14, w8, lt +; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: mov w10, v0.s[1] +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w12, d1 ; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: fmov w14, s2 +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: fmov s1, w13 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 +; CHECK-NEXT: mov w10, v2.s[1] +; CHECK-NEXT: mov v0.h[2], w14 +; CHECK-NEXT: csel w12, w12, w11, gt ; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: mov v4.s[1], w12 +; CHECK-NEXT: mov v1.s[1], w9 ; CHECK-NEXT: csel w8, w15, w8, lt -; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: adrp x8, .LCPI84_0 -; CHECK-NEXT: mov v2.s[1], w13 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0] -; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.16b +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.h[4], w9 +; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: mov v0.h[5], w8 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: mov v0.h[6], w8 +; CHECK-NEXT: mov v0.h[7], w9 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) ret <8 x i16> %x @@ -3595,116 +3618,140 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-LABEL: test_signed_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v3.d[1] +; CHECK-NEXT: mov d16, v0.d[1] ; CHECK-NEXT: mov w9, #32767 -; CHECK-NEXT: fcvtzs w11, d3 +; CHECK-NEXT: fcvtzs w11, d0 ; CHECK-NEXT: mov w8, #-32768 -; CHECK-NEXT: mov d3, v1.d[1] +; CHECK-NEXT: mov d0, v2.d[1] +; CHECK-NEXT: fcvtzs w12, d1 ; CHECK-NEXT: fcvtzs w14, d2 -; CHECK-NEXT: fcvtzs w15, d1 -; CHECK-NEXT: mov d1, v7.d[1] +; CHECK-NEXT: mov d2, v4.d[1] ; CHECK-NEXT: fcvtzs w10, d16 -; CHECK-NEXT: mov d16, v2.d[1] -; CHECK-NEXT: mov d2, v0.d[1] -; CHECK-NEXT: fcvtzs w18, d0 -; CHECK-NEXT: mov d0, v6.d[1] -; CHECK-NEXT: fcvtzs w0, d7 +; CHECK-NEXT: mov d16, v1.d[1] +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fcvtzs w16, d3 +; CHECK-NEXT: fcvtzs w15, d0 +; CHECK-NEXT: mov d3, v6.d[1] ; CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: fcvtzs w2, d6 ; CHECK-NEXT: csel w10, w10, w9, lt -; CHECK-NEXT: fcvtzs w12, d16 +; CHECK-NEXT: fcvtzs w13, d16 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w17, d2 +; CHECK-NEXT: fcvtzs w17, d1 ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w11, w9 ; CHECK-NEXT: csel w11, w11, w9, lt -; CHECK-NEXT: fcvtzs w1, d0 -; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 -; CHECK-NEXT: mov d0, v4.d[1] -; CHECK-NEXT: csel w13, w11, w8, gt -; CHECK-NEXT: cmp w12, w9 -; CHECK-NEXT: csel w11, w12, w9, lt -; CHECK-NEXT: fcvtzs w12, d3 +; CHECK-NEXT: mov d1, v5.d[1] ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w11, w11, w8, gt -; CHECK-NEXT: cmp w14, w9 -; CHECK-NEXT: csel w14, w14, w9, lt -; CHECK-NEXT: fmov s19, w13 -; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w8, gt +; CHECK-NEXT: cmp w13, w9 +; CHECK-NEXT: csel w13, w13, w9, lt +; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w13, w13, w8, gt ; CHECK-NEXT: cmp w12, w9 ; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: fmov s0, w11 ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w12, w8, gt ; CHECK-NEXT: cmp w15, w9 ; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w16, w15, w8, gt +; CHECK-NEXT: csel w11, w15, w8, gt +; CHECK-NEXT: cmp w14, w9 +; CHECK-NEXT: csel w14, w14, w9, lt +; CHECK-NEXT: fcvtzs w15, d4 +; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w14, w14, w8, gt ; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: csel w15, w17, w9, lt +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: csel w10, w17, w9, lt +; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w17, d2 +; CHECK-NEXT: csel w10, w10, w8, gt +; CHECK-NEXT: cmp w16, w9 +; CHECK-NEXT: fmov s2, w12 +; CHECK-NEXT: csel w12, w16, w9, lt +; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 +; CHECK-NEXT: mov w16, v0.s[1] +; CHECK-NEXT: csel w12, w12, w8, gt +; CHECK-NEXT: cmp w17, w9 +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: csel w13, w17, w9, lt +; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-NEXT: fcvtzs w17, d1 +; CHECK-NEXT: csel w13, w13, w8, gt +; CHECK-NEXT: cmp w15, w9 +; CHECK-NEXT: csel w15, w15, w9, lt +; CHECK-NEXT: fmov s4, w14 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: mov d1, v5.d[1] +; CHECK-NEXT: mov v0.h[1], w16 +; CHECK-NEXT: fcvtzs w16, d5 ; CHECK-NEXT: csel w15, w15, w8, gt -; CHECK-NEXT: cmp w18, w9 -; CHECK-NEXT: csel w18, w18, w9, lt -; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w18, w18, w8, gt ; CHECK-NEXT: cmp w17, w9 ; CHECK-NEXT: csel w17, w17, w9, lt ; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w17, w17, w8, gt -; CHECK-NEXT: cmp w0, w9 -; CHECK-NEXT: csel w0, w0, w9, lt -; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w0, w8, gt -; CHECK-NEXT: cmp w1, w9 -; CHECK-NEXT: csel w1, w1, w9, lt -; CHECK-NEXT: fcvtzs w0, d1 -; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v19.s[1], w10 -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: cmp w2, w9 -; CHECK-NEXT: fcvtzs w1, d5 -; CHECK-NEXT: csel w2, w2, w9, lt -; CHECK-NEXT: fmov s18, w14 -; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov s23, w13 -; CHECK-NEXT: csel w2, w2, w8, gt -; CHECK-NEXT: cmp w0, w9 -; CHECK-NEXT: csel w14, w0, w9, lt -; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w14, w8, gt -; CHECK-NEXT: cmp w1, w9 -; CHECK-NEXT: fcvtzs w14, d0 -; CHECK-NEXT: csel w0, w1, w9, lt -; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v18.s[1], w11 -; CHECK-NEXT: csel w11, w0, w8, gt -; CHECK-NEXT: mov v23.s[1], w17 +; CHECK-NEXT: csel w14, w17, w8, gt +; CHECK-NEXT: cmp w16, w9 +; CHECK-NEXT: fmov s1, w15 +; CHECK-NEXT: csel w15, w16, w9, lt +; CHECK-NEXT: fcvtzs w16, d3 +; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v4.s[1], w11 +; CHECK-NEXT: csel w11, w15, w8, gt +; CHECK-NEXT: fcvtzs w15, d6 +; CHECK-NEXT: mov v1.s[1], w13 +; CHECK-NEXT: cmp w16, w9 +; CHECK-NEXT: fmov s3, w11 +; CHECK-NEXT: csel w16, w16, w9, lt +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov w13, v2.s[1] +; CHECK-NEXT: mov d2, v7.d[1] +; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w16, w16, w8, gt +; CHECK-NEXT: cmp w15, w9 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: csel w11, w15, w9, lt +; CHECK-NEXT: mov w15, v1.s[1] +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v3.s[1], w14 +; CHECK-NEXT: fcvtzs w14, d2 +; CHECK-NEXT: csel w11, w11, w8, gt +; CHECK-NEXT: mov v0.h[3], w13 +; CHECK-NEXT: mov v1.h[1], w15 ; CHECK-NEXT: cmp w14, w9 -; CHECK-NEXT: fcvtzs w17, d4 +; CHECK-NEXT: fmov w13, s3 ; CHECK-NEXT: csel w14, w14, w9, lt -; CHECK-NEXT: fmov s22, w2 +; CHECK-NEXT: fcvtzs w15, d7 +; CHECK-NEXT: fmov s2, w11 ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w8, gt -; CHECK-NEXT: fmov s17, w16 -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: mov v22.s[1], w10 -; CHECK-NEXT: csel w9, w17, w9, lt -; CHECK-NEXT: fmov s21, w11 +; CHECK-NEXT: mov w11, v3.s[1] +; CHECK-NEXT: mov v1.h[2], w13 +; CHECK-NEXT: csel w13, w14, w8, gt +; CHECK-NEXT: cmp w15, w9 +; CHECK-NEXT: fmov s3, w12 +; CHECK-NEXT: mov v2.s[1], w16 +; CHECK-NEXT: csel w9, w15, w9, lt ; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 +; CHECK-NEXT: fmov w12, s4 ; CHECK-NEXT: csel w8, w9, w8, gt -; CHECK-NEXT: adrp x9, .LCPI85_0 -; CHECK-NEXT: mov v17.s[1], w12 -; CHECK-NEXT: mov v21.s[1], w13 -; CHECK-NEXT: fmov s16, w18 -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0] -; CHECK-NEXT: fmov s20, w8 -; CHECK-NEXT: mov v16.s[1], w15 -; CHECK-NEXT: mov v20.s[1], w14 -; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b -; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b +; CHECK-NEXT: mov w14, v4.s[1] +; CHECK-NEXT: mov v1.h[3], w11 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: mov v0.h[4], w12 +; CHECK-NEXT: mov v1.h[4], w11 +; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: mov v0.h[5], w14 +; CHECK-NEXT: mov v1.h[5], w9 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov w10, v3.s[1] +; CHECK-NEXT: mov w11, v2.s[1] +; CHECK-NEXT: mov v0.h[6], w8 +; CHECK-NEXT: mov v1.h[6], w9 +; CHECK-NEXT: mov v0.h[7], w10 +; CHECK-NEXT: mov v1.h[7], w11 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index 35b7861..99ffde4 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2768,46 +2768,58 @@ define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) { define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_unsigned_v8f64_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: fcvtzu w10, d3 -; CHECK-NEXT: mov d3, v2.d[1] +; CHECK-NEXT: mov d5, v0.d[1] +; CHECK-NEXT: fcvtzu w10, d0 +; CHECK-NEXT: mov d0, v1.d[1] ; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: fcvtzu w12, d2 -; CHECK-NEXT: fcvtzu w13, d1 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: mov d4, v1.d[1] -; CHECK-NEXT: fcvtzu w11, d3 -; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzu w12, d1 +; CHECK-NEXT: mov d4, v2.d[1] +; CHECK-NEXT: fcvtzu w13, d3 +; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: fcvtzu w11, d0 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: csel w10, w11, w8, lo ; CHECK-NEXT: cmp w12, #255 -; CHECK-NEXT: csel w12, w12, w8, lo -; CHECK-NEXT: fmov s19, w10 -; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov v19.s[1], w9 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w13, #255 -; CHECK-NEXT: fmov s18, w12 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: csel w12, w13, w8, lo -; CHECK-NEXT: fcvtzu w13, d0 -; CHECK-NEXT: mov v18.s[1], w11 +; CHECK-NEXT: csel w11, w12, w8, lo +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: fmov s1, w11 +; CHECK-NEXT: fcvtzu w11, d2 ; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: fmov s17, w12 +; CHECK-NEXT: mov d2, v3.d[1] +; CHECK-NEXT: mov w12, v0.s[1] ; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: cmp w11, #255 +; CHECK-NEXT: csel w10, w11, w8, lo +; CHECK-NEXT: mov v0.b[1], w12 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov s4, w10 +; CHECK-NEXT: fcvtzu w10, d2 +; CHECK-NEXT: mov w12, v1.s[1] +; CHECK-NEXT: mov v0.b[2], w11 +; CHECK-NEXT: mov v4.s[1], w9 +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: csel w9, w10, w8, lo ; CHECK-NEXT: cmp w13, #255 ; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: mov v17.s[1], w10 -; CHECK-NEXT: fmov s16, w8 -; CHECK-NEXT: adrp x8, .LCPI82_0 -; CHECK-NEXT: mov v16.s[1], w9 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI82_0] -; CHECK-NEXT: tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.8b +; CHECK-NEXT: mov v0.b[3], w12 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: mov v0.b[4], w10 +; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: mov v0.b[6], w8 +; CHECK-NEXT: mov v0.b[7], w9 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f) ret <8 x i8> %x @@ -2927,46 +2939,57 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_unsigned_v8f64_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: fcvtzu w10, d3 -; CHECK-NEXT: mov d3, v2.d[1] +; CHECK-NEXT: mov d5, v0.d[1] +; CHECK-NEXT: fcvtzu w10, d0 +; CHECK-NEXT: mov d0, v1.d[1] ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: fcvtzu w12, d2 -; CHECK-NEXT: fcvtzu w13, d1 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: mov d4, v1.d[1] -; CHECK-NEXT: fcvtzu w11, d3 -; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzu w12, d1 +; CHECK-NEXT: mov d4, v2.d[1] +; CHECK-NEXT: fcvtzu w13, d3 +; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: fcvtzu w11, d0 ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, w8 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: csel w10, w11, w8, lo ; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w12, w12, w8, lo -; CHECK-NEXT: fmov s19, w10 -; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: mov v19.s[1], w9 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: fmov s18, w12 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: csel w12, w13, w8, lo -; CHECK-NEXT: fcvtzu w13, d0 -; CHECK-NEXT: mov v18.s[1], w11 +; CHECK-NEXT: csel w11, w12, w8, lo +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: fmov s1, w11 +; CHECK-NEXT: fcvtzu w11, d2 ; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: fmov s17, w12 +; CHECK-NEXT: mov d2, v3.d[1] +; CHECK-NEXT: mov w12, v0.s[1] ; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: csel w10, w11, w8, lo +; CHECK-NEXT: mov v0.h[1], w12 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov s4, w10 +; CHECK-NEXT: fcvtzu w10, d2 +; CHECK-NEXT: mov w12, v1.s[1] +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: mov v4.s[1], w9 +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w9, w10, w8, lo ; CHECK-NEXT: cmp w13, w8 ; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: mov v17.s[1], w10 -; CHECK-NEXT: fmov s16, w8 -; CHECK-NEXT: adrp x8, .LCPI84_0 -; CHECK-NEXT: mov v16.s[1], w9 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI84_0] -; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; CHECK-NEXT: mov v0.h[3], w12 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: mov v0.h[4], w10 +; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: mov v0.h[5], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: mov v0.h[6], w8 +; CHECK-NEXT: mov v0.h[7], w9 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f) ret <8 x i16> %x @@ -2975,83 +2998,107 @@ define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) { define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-LABEL: test_unsigned_v16f64_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v3.d[1] -; CHECK-NEXT: fcvtzu w9, d3 -; CHECK-NEXT: mov d3, v2.d[1] +; CHECK-NEXT: mov d16, v0.d[1] +; CHECK-NEXT: fcvtzu w9, d0 +; CHECK-NEXT: mov d0, v1.d[1] +; CHECK-NEXT: mov d17, v2.d[1] +; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: mov d1, v3.d[1] ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: mov d2, v1.d[1] -; CHECK-NEXT: fcvtzu w11, d1 -; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzu w12, d16 +; CHECK-NEXT: fcvtzu w12, d2 +; CHECK-NEXT: fcvtzu w11, d16 +; CHECK-NEXT: mov d2, v4.d[1] ; CHECK-NEXT: fcvtzu w13, d0 -; CHECK-NEXT: fcvtzu w14, d3 -; CHECK-NEXT: mov d0, v7.d[1] -; CHECK-NEXT: fcvtzu w15, d2 -; CHECK-NEXT: fcvtzu w17, d6 -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: fcvtzu w16, d1 -; CHECK-NEXT: csel w12, w12, w8, lo -; CHECK-NEXT: cmp w9, w8 -; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: csel w14, w14, w8, lo -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fmov s19, w9 -; CHECK-NEXT: csel w9, w15, w8, lo +; CHECK-NEXT: fcvtzu w14, d17 +; CHECK-NEXT: fcvtzu w15, d1 +; CHECK-NEXT: fcvtzu w16, d3 ; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: fcvtzu w15, d0 -; CHECK-NEXT: mov d0, v6.d[1] +; CHECK-NEXT: mov d1, v5.d[1] ; CHECK-NEXT: csel w11, w11, w8, lo -; CHECK-NEXT: mov v19.s[1], w12 -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: fcvtzu w12, d7 -; CHECK-NEXT: fmov s18, w10 -; CHECK-NEXT: csel w10, w16, w8, lo +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w13, w8 -; CHECK-NEXT: fcvtzu w16, d0 ; CHECK-NEXT: csel w13, w13, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: csel w15, w15, w8, lo +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w14, w14, w8, lo ; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: mov d0, v5.d[1] ; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: fcvtzu w17, d2 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: csel w9, w15, w8, lo +; CHECK-NEXT: fcvtzu w15, d4 ; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: mov v18.s[1], w14 -; CHECK-NEXT: fmov s23, w12 -; CHECK-NEXT: csel w12, w16, w8, lo -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: fcvtzu w16, d0 -; CHECK-NEXT: mov d0, v4.d[1] -; CHECK-NEXT: csel w14, w17, w8, lo -; CHECK-NEXT: fcvtzu w17, d5 -; CHECK-NEXT: fmov s17, w11 -; CHECK-NEXT: mov v23.s[1], w15 -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: fmov s22, w14 -; CHECK-NEXT: csel w14, w16, w8, lo +; CHECK-NEXT: fcvtzu w18, d1 +; CHECK-NEXT: csel w16, w16, w8, lo ; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: fcvtzu w16, d0 -; CHECK-NEXT: csel w15, w17, w8, lo -; CHECK-NEXT: fcvtzu w11, d4 -; CHECK-NEXT: mov v22.s[1], w12 -; CHECK-NEXT: cmp w16, w8 -; CHECK-NEXT: fmov s21, w15 -; CHECK-NEXT: csel w12, w16, w8, lo +; CHECK-NEXT: csel w17, w17, w8, lo +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: mov v0.s[1], w11 +; CHECK-NEXT: fcvtzu w0, d5 +; CHECK-NEXT: csel w11, w15, w8, lo +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: cmp w18, w8 +; CHECK-NEXT: mov d4, v6.d[1] +; CHECK-NEXT: csel w10, w18, w8, lo +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: fmov s1, w11 +; CHECK-NEXT: csel w11, w0, w8, lo +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: mov w13, v0.s[1] +; CHECK-NEXT: fcvtzu w15, d4 +; CHECK-NEXT: mov v1.s[1], w17 +; CHECK-NEXT: fmov s3, w11 +; CHECK-NEXT: mov d4, v7.d[1] +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov v3.s[1], w10 +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: mov w10, v1.s[1] +; CHECK-NEXT: mov w13, v2.s[1] +; CHECK-NEXT: fmov s2, w12 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: fcvtzu w11, d6 +; CHECK-NEXT: csel w12, w15, w8, lo +; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: fmov w10, s3 ; CHECK-NEXT: cmp w11, w8 -; CHECK-NEXT: csel w8, w11, w8, lo -; CHECK-NEXT: mov v17.s[1], w9 -; CHECK-NEXT: adrp x9, .LCPI85_0 -; CHECK-NEXT: mov v21.s[1], w14 -; CHECK-NEXT: fmov s16, w13 -; CHECK-NEXT: fmov s20, w8 -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI85_0] -; CHECK-NEXT: mov v16.s[1], w10 -; CHECK-NEXT: mov v20.s[1], w12 -; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b -; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b +; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: mov v0.h[3], w13 +; CHECK-NEXT: fcvtzu w13, d7 +; CHECK-NEXT: mov v1.h[2], w10 +; CHECK-NEXT: fmov s5, w11 +; CHECK-NEXT: fcvtzu w10, d4 +; CHECK-NEXT: mov w11, v3.s[1] +; CHECK-NEXT: mov v2.s[1], w14 +; CHECK-NEXT: fmov s3, w16 +; CHECK-NEXT: mov v5.s[1], w12 +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: csel w8, w13, w8, lo +; CHECK-NEXT: fmov w12, s2 +; CHECK-NEXT: mov v1.h[3], w11 +; CHECK-NEXT: fmov w13, s5 +; CHECK-NEXT: mov w14, v2.s[1] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: mov w11, v5.s[1] +; CHECK-NEXT: mov v0.h[4], w12 +; CHECK-NEXT: mov v1.h[4], w13 +; CHECK-NEXT: mov v3.s[1], w9 +; CHECK-NEXT: mov v2.s[1], w10 +; CHECK-NEXT: mov v0.h[5], w14 +; CHECK-NEXT: mov v1.h[5], w11 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov w10, v3.s[1] +; CHECK-NEXT: mov w11, v2.s[1] +; CHECK-NEXT: mov v0.h[6], w8 +; CHECK-NEXT: mov v1.h[6], w9 +; CHECK-NEXT: mov v0.h[7], w10 +; CHECK-NEXT: mov v1.h[7], w11 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll index aaa7dd0..dd7dd44 100644 --- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll @@ -268,13 +268,36 @@ entry: define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; CHECK-LABEL: extract_4_v4i32_badindex: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.b[3], w10 +; CHECK-NEXT: mov v0.b[4], w8 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.b[5], w9 +; CHECK-NEXT: mov w9, v1.s[3] +; CHECK-NEXT: mov v0.b[6], w8 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v0.b[7], w9 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: mov v0.b[8], w8 +; CHECK-NEXT: mov w8, v2.s[2] +; CHECK-NEXT: mov v0.b[9], w9 +; CHECK-NEXT: mov w9, v2.s[3] +; CHECK-NEXT: mov v0.b[10], w8 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov v0.b[11], w9 +; CHECK-NEXT: mov w9, v3.s[1] +; CHECK-NEXT: mov v0.b[12], w8 +; CHECK-NEXT: mov w8, v3.s[2] +; CHECK-NEXT: mov v0.b[13], w9 +; CHECK-NEXT: mov w9, v3.s[3] +; CHECK-NEXT: mov v0.b[14], w8 +; CHECK-NEXT: mov v0.b[15], w9 ; CHECK-NEXT: ret entry: %a0 = extractelement <4 x i32> %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll index a3a36ca..fe49164 100644 --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -1,33 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s -; CHECK: .LCPI0_0: -; CHECK: .byte 0 // 0x0 -; CHECK: .byte 16 // 0x10 -; CHECK: .byte 32 // 0x20 -; CHECK: .byte 48 // 0x30 -; CHECK: .byte 2 // 0x2 -; CHECK: .byte 18 // 0x12 -; CHECK: .byte 34 // 0x22 -; CHECK: .byte 50 // 0x32 -; CHECK: .byte 4 // 0x4 -; CHECK: .byte 20 // 0x14 -; CHECK: .byte 36 // 0x24 -; CHECK: .byte 52 // 0x34 -; CHECK: .byte 6 // 0x6 -; CHECK: .byte 22 // 0x16 -; CHECK: .byte 38 // 0x26 -; CHECK: .byte 54 // 0x36 define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; CHECK-LABEL: shuffle4_v4i8_16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w10, v1.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umov w8, v2.h[0] +; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-NEXT: fmov s4, w9 +; CHECK-NEXT: mov v4.b[1], w10 +; CHECK-NEXT: mov v4.b[2], w8 +; CHECK-NEXT: umov w8, v3.h[0] +; CHECK-NEXT: mov v4.b[3], w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v4.b[4], w8 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: mov v4.b[5], w8 +; CHECK-NEXT: umov w8, v2.h[1] +; CHECK-NEXT: mov v4.b[6], w8 +; CHECK-NEXT: umov w8, v3.h[1] +; CHECK-NEXT: mov v4.b[7], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: mov v4.b[8], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v4.b[9], w8 +; CHECK-NEXT: umov w8, v2.h[2] +; CHECK-NEXT: mov v4.b[10], w8 +; CHECK-NEXT: umov w8, v3.h[2] +; CHECK-NEXT: mov v4.b[11], w8 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: mov v4.b[12], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v4.b[13], w8 +; CHECK-NEXT: umov w8, v2.h[3] +; CHECK-NEXT: mov v4.b[14], w8 +; CHECK-NEXT: umov w8, v3.h[3] +; CHECK-NEXT: mov v4.b[15], w8 +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> @@ -35,25 +48,30 @@ define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i ret <16 x i8> %z } -; CHECK: .LCPI1_0: -; CHECK: .byte 0 // 0x0 -; CHECK: .byte 16 // 0x10 -; CHECK: .byte 32 // 0x20 -; CHECK: .byte 48 // 0x30 -; CHECK: .byte 2 // 0x2 -; CHECK: .byte 18 // 0x12 -; CHECK: .byte 34 // 0x22 -; CHECK: .byte 50 // 0x32 define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; CHECK-LABEL: shuffle4_v4i8_8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w10, v1.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umov w8, v2.h[0] +; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-NEXT: fmov s4, w9 +; CHECK-NEXT: umov w9, v3.h[0] +; CHECK-NEXT: mov v4.b[1], w10 +; CHECK-NEXT: mov v4.b[2], w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v4.b[3], w9 +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: mov v4.b[4], w8 +; CHECK-NEXT: umov w8, v2.h[1] +; CHECK-NEXT: mov v4.b[5], w9 +; CHECK-NEXT: umov w9, v3.h[1] +; CHECK-NEXT: mov v4.b[6], w8 +; CHECK-NEXT: mov v4.b[7], w9 +; CHECK-NEXT: fmov d0, d4 ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> @@ -370,33 +388,46 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x ret <8 x i16> %z } -; CHECK: .LCPI9_0: -; CHECK: .byte 0 // 0x0 -; CHECK: .byte 16 // 0x10 -; CHECK: .byte 32 // 0x20 -; CHECK: .byte 48 // 0x30 -; CHECK: .byte 2 // 0x2 -; CHECK: .byte 18 // 0x12 -; CHECK: .byte 34 // 0x22 -; CHECK: .byte 50 // 0x32 -; CHECK: .byte 4 // 0x4 -; CHECK: .byte 20 // 0x14 -; CHECK: .byte 36 // 0x24 -; CHECK: .byte 52 // 0x34 -; CHECK: .byte 6 // 0x6 -; CHECK: .byte 22 // 0x16 -; CHECK: .byte 38 // 0x26 -; CHECK: .byte 54 // 0x36 define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) { ; CHECK-LABEL: shuffle4_v4i16_trunc: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w10, v1.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umov w8, v2.h[0] +; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-NEXT: fmov s4, w9 +; CHECK-NEXT: mov v4.b[1], w10 +; CHECK-NEXT: mov v4.b[2], w8 +; CHECK-NEXT: umov w8, v3.h[0] +; CHECK-NEXT: mov v4.b[3], w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v4.b[4], w8 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: mov v4.b[5], w8 +; CHECK-NEXT: umov w8, v2.h[1] +; CHECK-NEXT: mov v4.b[6], w8 +; CHECK-NEXT: umov w8, v3.h[1] +; CHECK-NEXT: mov v4.b[7], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: mov v4.b[8], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v4.b[9], w8 +; CHECK-NEXT: umov w8, v2.h[2] +; CHECK-NEXT: mov v4.b[10], w8 +; CHECK-NEXT: umov w8, v3.h[2] +; CHECK-NEXT: mov v4.b[11], w8 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: mov v4.b[12], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v4.b[13], w8 +; CHECK-NEXT: umov w8, v2.h[3] +; CHECK-NEXT: mov v4.b[14], w8 +; CHECK-NEXT: umov w8, v3.h[3] +; CHECK-NEXT: mov v4.b[15], w8 +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %a = trunc <4 x i16> %ae to <4 x i8> %b = trunc <4 x i16> %be to <4 x i8> @@ -408,34 +439,45 @@ define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> % ret <16 x i8> %z } -; CHECK: .LCPI10_0: -; CHECK: .byte 0 // 0x0 -; CHECK: .byte 16 // 0x10 -; CHECK: .byte 32 // 0x20 -; CHECK: .byte 48 // 0x30 -; CHECK: .byte 2 // 0x2 -; CHECK: .byte 18 // 0x12 -; CHECK: .byte 34 // 0x22 -; CHECK: .byte 50 // 0x32 -; CHECK: .byte 4 // 0x4 -; CHECK: .byte 20 // 0x14 -; CHECK: .byte 36 // 0x24 -; CHECK: .byte 52 // 0x34 -; CHECK: .byte 6 // 0x6 -; CHECK: .byte 22 // 0x16 -; CHECK: .byte 38 // 0x26 -; CHECK: .byte 54 // 0x36 -; CHECK: .text define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) { ; CHECK-LABEL: shuffle4_v4i32_trunc: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: xtn v4.4h, v0.4s -; CHECK-NEXT: xtn v5.4h, v1.4s -; CHECK-NEXT: xtn v6.4h, v2.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: xtn v7.4h, v3.4s -; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: xtn v3.4h, v3.4s +; CHECK-NEXT: umov w8, v4.h[0] +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: umov w8, v2.h[0] +; CHECK-NEXT: mov v0.b[1], w9 +; CHECK-NEXT: mov v0.b[2], w8 +; CHECK-NEXT: umov w8, v3.h[0] +; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v4.h[1] +; CHECK-NEXT: mov v0.b[4], w8 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v2.h[1] +; CHECK-NEXT: mov v0.b[6], w8 +; CHECK-NEXT: umov w8, v3.h[1] +; CHECK-NEXT: mov v0.b[7], w8 +; CHECK-NEXT: umov w8, v4.h[2] +; CHECK-NEXT: mov v0.b[8], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[9], w8 +; CHECK-NEXT: umov w8, v2.h[2] +; CHECK-NEXT: mov v0.b[10], w8 +; CHECK-NEXT: umov w8, v3.h[2] +; CHECK-NEXT: mov v0.b[11], w8 +; CHECK-NEXT: umov w8, v4.h[3] +; CHECK-NEXT: mov v0.b[12], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v0.b[13], w8 +; CHECK-NEXT: umov w8, v2.h[3] +; CHECK-NEXT: mov v0.b[14], w8 +; CHECK-NEXT: umov w8, v3.h[3] +; CHECK-NEXT: mov v0.b[15], w8 ; CHECK-NEXT: ret %a = trunc <4 x i32> %ae to <4 x i8> %b = trunc <4 x i32> %be to <4 x i8> @@ -447,32 +489,37 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> % ret <16 x i8> %z } -; CHECK: .LCPI11_0: -; CHECK: .byte 0 // 0x0 -; CHECK: .byte 16 // 0x10 -; CHECK: .byte 32 // 0x20 -; CHECK: .byte 2 // 0x2 -; CHECK: .byte 18 // 0x12 -; CHECK: .byte 34 // 0x22 -; CHECK: .byte 4 // 0x4 -; CHECK: .byte 20 // 0x14 -; CHECK: .byte 36 // 0x24 -; CHECK: .byte 6 // 0x6 -; CHECK: .byte 22 // 0x16 -; CHECK: .byte 38 // 0x26 -; CHECK: .byte 255 // 0xff -; CHECK: .byte 255 // 0xff -; CHECK: .byte 255 // 0xff -; CHECK: .byte 255 // 0xff define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) { ; CHECK-LABEL: shuffle3_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmov s3, w8 +; CHECK-NEXT: umov w8, v2.h[0] +; CHECK-NEXT: mov v3.b[1], w9 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: mov v3.b[2], w8 +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: mov v3.b[3], w9 +; CHECK-NEXT: umov w9, v2.h[1] +; CHECK-NEXT: mov v3.b[4], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: mov v3.b[5], w9 +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov v3.b[6], w8 +; CHECK-NEXT: umov w8, v2.h[2] +; CHECK-NEXT: mov v3.b[7], w9 +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: mov v3.b[8], w8 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: mov v3.b[9], w9 +; CHECK-NEXT: umov w9, v2.h[3] +; CHECK-NEXT: mov v3.b[10], w8 +; CHECK-NEXT: mov v3.b[11], w9 +; CHECK-NEXT: mov v0.16b, v3.16b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> %y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 946128c..d9b610d 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -371,38 +371,59 @@ define void @loop3(i8* noalias nocapture noundef writeonly %dst, float* nocaptur ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_8: // %vector.ph ; CHECK-NEXT: add x11, x8, #1 -; CHECK-NEXT: adrp x12, .LCPI2_0 -; CHECK-NEXT: and x10, x11, #0x1fffffffc ; CHECK-NEXT: mov w13, #1132396544 -; CHECK-NEXT: add x8, x10, x10, lsl #1 -; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI2_0] -; CHECK-NEXT: add x9, x0, x8 +; CHECK-NEXT: and x10, x11, #0x1fffffffc ; CHECK-NEXT: mov x12, x10 -; CHECK-NEXT: add x8, x1, x8, lsl #2 -; CHECK-NEXT: dup v1.4s, w13 +; CHECK-NEXT: add x9, x10, x10, lsl #1 +; CHECK-NEXT: dup v0.4s, w13 +; CHECK-NEXT: add x8, x1, x9, lsl #2 +; CHECK-NEXT: add x9, x0, x9 ; CHECK-NEXT: .LBB2_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48 -; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0 -; CHECK-NEXT: add x13, x0, #8 -; CHECK-NEXT: fmin v6.4s, v2.4s, v1.4s +; CHECK-NEXT: ld3 { v1.4s, v2.4s, v3.4s }, [x1], #48 +; CHECK-NEXT: fcmlt v4.4s, v1.4s, #0.0 ; CHECK-NEXT: subs x12, x12, #4 +; CHECK-NEXT: fmin v5.4s, v1.4s, v0.4s +; CHECK-NEXT: fmin v6.4s, v2.4s, v0.4s ; CHECK-NEXT: fcmlt v7.4s, v3.4s, #0.0 -; CHECK-NEXT: fmin v16.4s, v3.4s, v1.4s -; CHECK-NEXT: fmin v2.4s, v4.4s, v1.4s -; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b -; CHECK-NEXT: fcmlt v6.4s, v4.4s, #0.0 -; CHECK-NEXT: bic v3.16b, v16.16b, v7.16b -; CHECK-NEXT: fcvtzs v4.4s, v5.4s -; CHECK-NEXT: fcvtzs v3.4s, v3.4s -; CHECK-NEXT: bic v2.16b, v2.16b, v6.16b +; CHECK-NEXT: fmin v1.4s, v3.4s, v0.4s +; CHECK-NEXT: bic v4.16b, v5.16b, v4.16b +; CHECK-NEXT: fcmlt v5.4s, v2.4s, #0.0 +; CHECK-NEXT: fcvtzs v4.4s, v4.4s +; CHECK-NEXT: bic v1.16b, v1.16b, v7.16b +; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: bic v2.16b, v6.16b, v5.16b ; CHECK-NEXT: fcvtzs v2.4s, v2.4s -; CHECK-NEXT: xtn v4.4h, v4.4s -; CHECK-NEXT: xtn v5.4h, v3.4s -; CHECK-NEXT: xtn v6.4h, v2.4s -; CHECK-NEXT: tbl v2.16b, { v4.16b, v5.16b, v6.16b }, v0.16b -; CHECK-NEXT: str d2, [x0], #12 -; CHECK-NEXT: st1 { v2.s }[2], [x13] +; CHECK-NEXT: xtn v3.4h, v4.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: umov w13, v3.h[0] +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: umov w14, v2.h[0] +; CHECK-NEXT: fmov s4, w13 +; CHECK-NEXT: umov w13, v1.h[0] +; CHECK-NEXT: mov v4.b[1], w14 +; CHECK-NEXT: umov w14, v3.h[1] +; CHECK-NEXT: mov v4.b[2], w13 +; CHECK-NEXT: umov w13, v2.h[1] +; CHECK-NEXT: mov v4.b[3], w14 +; CHECK-NEXT: umov w14, v1.h[1] +; CHECK-NEXT: mov v4.b[4], w13 +; CHECK-NEXT: umov w13, v3.h[2] +; CHECK-NEXT: mov v4.b[5], w14 +; CHECK-NEXT: umov w14, v2.h[2] +; CHECK-NEXT: mov v4.b[6], w13 +; CHECK-NEXT: umov w13, v1.h[2] +; CHECK-NEXT: mov v4.b[7], w14 +; CHECK-NEXT: umov w14, v3.h[3] +; CHECK-NEXT: mov v4.b[8], w13 +; CHECK-NEXT: umov w13, v2.h[3] +; CHECK-NEXT: mov v4.b[9], w14 +; CHECK-NEXT: umov w14, v1.h[3] +; CHECK-NEXT: mov v4.b[10], w13 +; CHECK-NEXT: add x13, x0, #8 +; CHECK-NEXT: mov v4.b[11], w14 +; CHECK-NEXT: str d4, [x0], #12 +; CHECK-NEXT: st1 { v4.s }[2], [x13] ; CHECK-NEXT: b.ne .LBB2_9 ; CHECK-NEXT: // %bb.10: // %middle.block ; CHECK-NEXT: cmp x11, x10 @@ -585,40 +606,69 @@ define void @loop4(i8* noalias nocapture noundef writeonly %dst, float* nocaptur ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_8: // %vector.ph ; CHECK-NEXT: add x11, x8, #1 -; CHECK-NEXT: adrp x12, .LCPI3_0 -; CHECK-NEXT: and x10, x11, #0x1fffffffc ; CHECK-NEXT: mov w13, #1132396544 +; CHECK-NEXT: and x10, x11, #0x1fffffffc +; CHECK-NEXT: mov x12, x10 ; CHECK-NEXT: add x8, x1, x10, lsl #4 ; CHECK-NEXT: add x9, x0, x10, lsl #2 -; CHECK-NEXT: ldr q0, [x12, :lo12:.LCPI3_0] -; CHECK-NEXT: mov x12, x10 -; CHECK-NEXT: dup v1.4s, w13 +; CHECK-NEXT: dup v0.4s, w13 ; CHECK-NEXT: .LBB3_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64 -; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0 +; CHECK-NEXT: ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x1], #64 +; CHECK-NEXT: fcmlt v5.4s, v1.4s, #0.0 ; CHECK-NEXT: subs x12, x12, #4 -; CHECK-NEXT: fmin v7.4s, v2.4s, v1.4s +; CHECK-NEXT: fmin v6.4s, v1.4s, v0.4s +; CHECK-NEXT: fmin v7.4s, v2.4s, v0.4s ; CHECK-NEXT: fcmlt v16.4s, v3.4s, #0.0 -; CHECK-NEXT: fmin v17.4s, v3.4s, v1.4s -; CHECK-NEXT: fmin v18.4s, v4.4s, v1.4s +; CHECK-NEXT: fmin v17.4s, v3.4s, v0.4s +; CHECK-NEXT: bic v5.16b, v6.16b, v5.16b +; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0 +; CHECK-NEXT: fcvtzs v5.4s, v5.4s +; CHECK-NEXT: fmin v1.4s, v4.4s, v0.4s ; CHECK-NEXT: bic v6.16b, v7.16b, v6.16b -; CHECK-NEXT: fcmlt v7.4s, v4.4s, #0.0 -; CHECK-NEXT: bic v16.16b, v17.16b, v16.16b -; CHECK-NEXT: fcmlt v17.4s, v5.4s, #0.0 -; CHECK-NEXT: fmin v2.4s, v5.4s, v1.4s -; CHECK-NEXT: fcvtzs v4.4s, v6.4s -; CHECK-NEXT: bic v3.16b, v18.16b, v7.16b -; CHECK-NEXT: fcvtzs v5.4s, v16.4s -; CHECK-NEXT: fcvtzs v3.4s, v3.4s -; CHECK-NEXT: bic v2.16b, v2.16b, v17.16b -; CHECK-NEXT: fcvtzs v2.4s, v2.4s -; CHECK-NEXT: xtn v16.4h, v4.4s -; CHECK-NEXT: xtn v17.4h, v5.4s -; CHECK-NEXT: xtn v18.4h, v3.4s -; CHECK-NEXT: xtn v19.4h, v2.4s -; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b -; CHECK-NEXT: str q2, [x0], #16 +; CHECK-NEXT: fcvtzs v6.4s, v6.4s +; CHECK-NEXT: xtn v5.4h, v5.4s +; CHECK-NEXT: bic v7.16b, v17.16b, v16.16b +; CHECK-NEXT: fcmlt v16.4s, v4.4s, #0.0 +; CHECK-NEXT: umov w13, v5.h[0] +; CHECK-NEXT: xtn v2.4h, v6.4s +; CHECK-NEXT: fcvtzs v3.4s, v7.4s +; CHECK-NEXT: umov w14, v2.h[0] +; CHECK-NEXT: bic v1.16b, v1.16b, v16.16b +; CHECK-NEXT: fmov s4, w13 +; CHECK-NEXT: xtn v3.4h, v3.4s +; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: mov v4.b[1], w14 +; CHECK-NEXT: umov w13, v3.h[0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mov v4.b[2], w13 +; CHECK-NEXT: umov w13, v1.h[0] +; CHECK-NEXT: mov v4.b[3], w13 +; CHECK-NEXT: umov w13, v5.h[1] +; CHECK-NEXT: mov v4.b[4], w13 +; CHECK-NEXT: umov w13, v2.h[1] +; CHECK-NEXT: mov v4.b[5], w13 +; CHECK-NEXT: umov w13, v3.h[1] +; CHECK-NEXT: mov v4.b[6], w13 +; CHECK-NEXT: umov w13, v1.h[1] +; CHECK-NEXT: mov v4.b[7], w13 +; CHECK-NEXT: umov w13, v5.h[2] +; CHECK-NEXT: mov v4.b[8], w13 +; CHECK-NEXT: umov w13, v2.h[2] +; CHECK-NEXT: mov v4.b[9], w13 +; CHECK-NEXT: umov w13, v3.h[2] +; CHECK-NEXT: mov v4.b[10], w13 +; CHECK-NEXT: umov w13, v1.h[2] +; CHECK-NEXT: mov v4.b[11], w13 +; CHECK-NEXT: umov w13, v5.h[3] +; CHECK-NEXT: mov v4.b[12], w13 +; CHECK-NEXT: umov w13, v2.h[3] +; CHECK-NEXT: mov v4.b[13], w13 +; CHECK-NEXT: umov w13, v3.h[3] +; CHECK-NEXT: mov v4.b[14], w13 +; CHECK-NEXT: umov w13, v1.h[3] +; CHECK-NEXT: mov v4.b[15], w13 +; CHECK-NEXT: str q4, [x0], #16 ; CHECK-NEXT: b.ne .LBB3_9 ; CHECK-NEXT: // %bb.10: // %middle.block ; CHECK-NEXT: cmp x11, x10