From b147b88c8432cdc14a3238925dbfb8d55be32932 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 6 May 2020 17:14:15 +0100
Subject: [PATCH] [CodeGen] Add support for extracting elements of scalable
 vectors

I have tried to ensure that SelectionDAG and DAGCombiner do sensible
things for scalable vectors, and have added support for a limited
number of simple folds. Codegen support for the vector extract
patterns has also been added to the AArch64 backend.

New vector extract tests have been added here:

  CodeGen/AArch64/sve-extract-element.ll

and I have also added new folds using inserts and extracts here:

  CodeGen/AArch64/sve-insert-element.ll

Differential Revision: https://reviews.llvm.org/D80208
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    |  24 ++-
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp   |  39 +++-
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td   |  40 ++++
 llvm/test/CodeGen/AArch64/sve-extract-element.ll | 247 +++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/sve-insert-element.ll  | 206 +++++++++++++------
 5 files changed, 481 insertions(+), 75 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-extract-element.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7e41b2f..9216151 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17429,16 +17429,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {

   // extract_vector_elt of out-of-bounds element -> UNDEF
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  unsigned NumElts = VecVT.getVectorNumElements();
-  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
-  if (IndexC && IndexC->getAPIntValue().uge(NumElts))
+  if (IndexC && VecVT.isFixedLengthVector() &&
+      IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
     return DAG.getUNDEF(ScalarVT);

   // extract_vector_elt (build_vector x, y), 1 -> y
-  if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
+  if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+       VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
       TLI.isTypeLegal(VecVT) &&
       (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
-    SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
+    assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
+            VecVT.isFixedLengthVector()) &&
+           "BUILD_VECTOR used for scalable vectors");
+    unsigned IndexVal =
+        VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
+    SDValue Elt = VecOp.getOperand(IndexVal);
     EVT InEltVT = Elt.getValueType();

     // Sometimes build_vector's scalar input types do not match result type.
@@ -17449,6 +17454,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }

+  if (VecVT.isScalableVector())
+    return SDValue();
+
+  // All the code from this point onwards assumes fixed width vectors, but it's
+  // possible that some of the combinations could be made to work for scalable
+  // vectors too.
+  unsigned NumElts = VecVT.getVectorNumElements();
+  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
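[Note for readers of this rendered patch, not part of the committed change:
the fold added above, and its twin in SelectionDAG::getNode below, mean that
extracting any lane of a SPLAT_VECTOR yields the splatted scalar, since every
lane of a splat holds the same value. A minimal C++ sketch of the folded
pattern; `DAG`, `DL` and the i32-typed value `X` are assumed to be in scope,
as they would be inside a combine:

  // Splat the i32 value X across a scalable vector, then extract lane 2.
  SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32, X);
  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Splat,
                            DAG.getVectorIdxConstant(2, DL));
  // Because every lane holds the same value, getNode() now returns X directly
  // instead of creating an EXTRACT_VECTOR_ELT node; visitEXTRACT_VECTOR_ELT
  // performs the same fold (reading operand 0) for nodes that reach the
  // combiner.]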
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index cfb15d6..2f277ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -38,6 +38,7 @@
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -5362,15 +5363,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (N1.isUndef() || N2.isUndef())
       return getUNDEF(VT);

-    // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
-    if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+    // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF for fixed length
+    // vectors. For scalable vectors we will provide appropriate support for
+    // dealing with arbitrary indices.
+    if (N2C && N1.getValueType().isFixedLengthVector() &&
+        N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
       return getUNDEF(VT);

     // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
-    // expanding copies of large vectors from registers.
-    if (N2C &&
-        N1.getOpcode() == ISD::CONCAT_VECTORS &&
-        N1.getNumOperands() > 0) {
+    // expanding copies of large vectors from registers. This only works for
+    // fixed length vectors, since we need to know the exact number of
+    // elements.
+    if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() &&
+        N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) {
       unsigned Factor =
           N1.getOperand(0).getValueType().getVectorNumElements();
       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
@@ -5378,10 +5383,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                      getVectorIdxConstant(N2C->getZExtValue() % Factor, DL));
     }

-    // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
-    // expanding large vector constants.
-    if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
-      SDValue Elt = N1.getOperand(N2C->getZExtValue());
+    // EXTRACT_VECTOR_ELT of BUILD_VECTOR or SPLAT_VECTOR is often formed while
+    // lowering is expanding large vector constants.
+    if (N2C && (N1.getOpcode() == ISD::BUILD_VECTOR ||
+                N1.getOpcode() == ISD::SPLAT_VECTOR)) {
+      assert((N1.getOpcode() != ISD::BUILD_VECTOR ||
+              N1.getValueType().isFixedLengthVector()) &&
+             "BUILD_VECTOR used for scalable vectors");
+      unsigned Index =
+          N1.getOpcode() == ISD::BUILD_VECTOR ? N2C->getZExtValue() : 0;
+      SDValue Elt = N1.getOperand(Index);

       if (VT != Elt.getValueType())
         // If the vector element type is not legal, the BUILD_VECTOR operands
@@ -5415,8 +5426,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,

     // EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
     // when vector types are scalarized and v1iX is legal.
-    // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
+    // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx).
+    // Here we are completely ignoring the extract element index (N2),
+    // which is fine for fixed width vectors, since any index other than 0
+    // is undefined anyway. However, this cannot be ignored for scalable
+    // vectors - in theory we could support this, but we don't want to do this
+    // without a profitability check.
     if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N1.getValueType().isFixedLengthVector() &&
         N1.getValueType().getVectorNumElements() == 1) {
       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
                      N1.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 54a7643..f5b983a 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1897,6 +1897,46 @@ multiclass sve_prefetch
+
+  // Extract element from vector with immediate index
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+
+  // Extract element from vector with scalar index
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+                         ZPR:$vec)>;
+
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+                         ZPR:$vec)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+                         ZPR:$vec)>;
 }

 let Predicates = [HasSVE, HasMatMulInt8] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
new file mode 100644
index 0000000..4cb3103
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, b0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 0
+  ret i8 %b
+}
+
+define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x i16> %a, i32 0
+  ret i16 %b
%b +} + +define i32 @test_lane0_4xi32( %a) { +; CHECK-LABEL: test_lane0_4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + ret i32 %b +} + +define i64 @test_lane0_2xi64( %a) { +; CHECK-LABEL: test_lane0_2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + ret i64 %b +} + +define double @test_lane0_2xf64( %a) { +; CHECK-LABEL: test_lane0_2xf64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + ret double %b +} + +define float @test_lane0_4xf32( %a) { +; CHECK-LABEL: test_lane0_4xf32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + ret float %b +} + +define half @test_lane0_8xf16( %a) { +; CHECK-LABEL: test_lane0_8xf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + ret half %b +} + +define i8 @test_lanex_16xi8( %a, i32 %x) { +; CHECK-LABEL: test_lanex_16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.b, xzr, x8 +; CHECK-NEXT: lastb w0, p0, z0.b +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret i8 %b +} + +define i16 @test_lanex_8xi16( %a, i32 %x) { +; CHECK-LABEL: test_lanex_8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb w0, p0, z0.h +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret i16 %b +} + +define i32 @test_lanex_4xi32( %a, i32 %x) { +; CHECK-LABEL: test_lanex_4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w0, p0, z0.s +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret i32 %b +} + +define i64 @test_lanex_2xi64( %a, i32 %x) { +; CHECK-LABEL: test_lanex_2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb x0, p0, z0.d +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret i64 %b +} + +define double @test_lanex_2xf64( %a, i32 %x) { +; CHECK-LABEL: test_lanex_2xf64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret double %b +} + +define float @test_lanex_4xf32( %a, i32 %x) { +; CHECK-LABEL: test_lanex_4xf32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret float %b +} + +define half @test_lanex_8xf16( %a, i32 %x) { +; CHECK-LABEL: test_lanex_8xf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h +; CHECK-NEXT: ret + %b = extractelement %a, i32 %x + ret half %b +} + +; Deliberately choose an index that is out-of-bounds +define i8 @test_lane64_16xi8( %a) { +; CHECK-LABEL: test_lane64_16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #64 +; CHECK-NEXT: whilels p0.b, xzr, x8 +; CHECK-NEXT: 
+; CHECK-NEXT:    lastb w0, p0, z0.b
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 64
+  ret i8 %b
+}
+
+define double @test_lane9_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane9_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 9
+  ret double %b
+}
+
+; Deliberately choose an index that is undefined
+define i32 @test_lane64_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane64_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 undef
+  ret i32 %b
+}
+
+define i8 @extract_of_insert_undef_16xi8(i8 %a) {
+; CHECK-LABEL: extract_of_insert_undef_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
+  %c = extractelement <vscale x 16 x i8> %b, i32 0
+  ret i8 %c
+}
+
+define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract0_of_insert0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+  %d = extractelement <vscale x 16 x i8> %c, i32 0
+  ret i8 %d
+}
+
+define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract64_of_insert64_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 64
+  %d = extractelement <vscale x 16 x i8> %c, i32 64
+  ret i8 %d
+}
+
+define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract_of_insert_diff_lanes_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, z0.b[3]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+  %d = extractelement <vscale x 16 x i8> %c, i32 3
+  ret i8 %d
+}
+
+define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_zero_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> zeroinitializer, i32 0
+  ret i8 %b
+}
+
+; The DAG combiner should fold the extract of a splat to give element zero
+; of the splat, i.e. %x. If the index is beyond the end of the scalable
+; vector the result is undefined anyway.
+define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
+; CHECK-LABEL: test_lanex_splat_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %a = insertelement <vscale x 2 x i64> undef, i64 %x, i32 0
+  %b = shufflevector <vscale x 2 x i64> %a, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %c = extractelement <vscale x 2 x i64> %b, i32 %y
+  ret i64 %c
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index 90acf8c..daaaa6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -1,106 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

 define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane0_16xi8
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.b, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
   ret <vscale x 16 x i8> %b
 }

 define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
-; CHECK-LABEL: test_lane0_8xi16
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.h, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.h, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
   ret <vscale x 8 x i16> %b
 }

 define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: test_lane0_4xi32
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.s, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.s, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
   ret <vscale x 4 x i32> %b
 }

 define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane0_2xi64
-; CHECK: mov w[[REG:.*]], #30
-; CHECK: mov z0.d, p{{[0-7]}}/m, x[[REG]]
+; CHECK-LABEL: test_lane0_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
   ret <vscale x 2 x i64> %b
 }

 define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane0_2xf64
-; CHECK: fmov d[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.d, p{{[0-7]}}/m, z[[REG]].d
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, #1.00000000
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
   ret <vscale x 2 x double> %b
 }

 define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
-; CHECK-LABEL: test_lane0_4xf32
-; CHECK: fmov s[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.s, p{{[0-7]}}/m, z[[REG]].s
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s1, #1.00000000
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
   ret <vscale x 4 x float> %b
 }

 define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane0_8xf16
-; CHECK: fmov h[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.h, p{{[0-7]}}/m, z[[REG]].h
+; CHECK-LABEL: test_lane0_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov h1, #1.00000000
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
   ret <vscale x 8 x half> %b
 }

 ; Undefined lane insert
 define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane4_2xi64
-; CHECK: mov w[[IDXREG:.*]], #4
-; CHECK: index z[[CMPVEC:[0-9]+]].d, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.d, p[[PRED]]/m, x[[VALREG]]
+; CHECK-LABEL: test_lane4_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
   ret <vscale x 2 x i64> %b
 }

 ; Undefined lane insert
 define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane9_8xf16
-; CHECK: mov w[[IDXREG:.*]], #9
-; CHECK: index z[[CMPVEC:[0-9]+]].h, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK: fmov h[[VALREG:[0-9]+]], #1.00000000
-; CHECK: mov z0.h, p[[PRED]]/m, h[[VALREG]]
+; CHECK-LABEL: test_lane9_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fmov h1, #1.00000000
+; CHECK-NEXT:    mov z0.h, p0/m, h1
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
   ret <vscale x 8 x half> %b
 }

 define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane1_16xi8
-; CHECK: mov w[[IDXREG:.*]], #1
-; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lane1_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
   ret <vscale x 16 x i8> %b
 }

 define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_16xi8
-; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lanex_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov w8, #30
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
   ret <vscale x 16 x i8> %b
 }
@@ -108,28 +144,80 @@ define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {

 ; Redundant lane insert
 define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: extract_insert_4xi32
-; CHECK-NOT: mov w{{.*}}, #30
-; CHECK-NOT: mov z0.d
+; CHECK-LABEL: extract_insert_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
   %b = extractelement <vscale x 4 x i32> %a, i32 2
   %c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
   ret <vscale x 4 x i32> %c
 }

 define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
-; CHECK-LABEL: test_lane6_undef_8xi16
-; CHECK: mov w[[IDXREG:.*]], #6
-; CHECK: index z[[CMPVEC:.*]].h, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK: mov z0.h, p[[PRED]]/m, w0
+; CHECK-LABEL: test_lane6_undef_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
   ret <vscale x 8 x i16> %b
 }

 define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
-; CHECK-LABEL: test_lane0_undef_16xi8
-; CHECK: fmov s0, w0
+; CHECK-LABEL: test_lane0_undef_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
   ret <vscale x 16 x i8> %b
 }
+
+define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert0_of_extract0_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, b1
+; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 0
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 0
+  ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert64_of_extract64_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    lastb w8, p1, z1.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 64
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 64
+  ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert3_of_extract1_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, z1.b[1]
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
+; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    ret
+  %c = extractelement <vscale x 16 x i8> %b, i32 1
+  %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3
+  ret <vscale x 16 x i8> %d
+}
-- 
2.7.4
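[Postscript, not part of the committed change: a sketch of why the
out-of-bounds -> UNDEF fold above is now gated on fixed-length vectors.
The helper name is illustrative only:

  #include "llvm/ADT/APInt.h"
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Mirrors the guard this patch adds: an extract at constant index Idx can
  // only be folded to UNDEF when the element count is a compile-time
  // constant, i.e. for fixed-length vectors.
  static bool extractFoldsToUndef(EVT VT, uint64_t Idx) {
    return VT.isFixedLengthVector() &&
           APInt(64, Idx).uge(VT.getVectorNumElements());
  }
  // extractFoldsToUndef(MVT::v4i32, 64)   -> true: 64 >= 4, provably OOB.
  // extractFoldsToUndef(MVT::nxv4i32, 64) -> false: the runtime length is an
  // unknown multiple of 4, so lane 64 may exist; codegen instead selects the
  // WHILELS/LASTB sequence seen in the tests above.]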