From 9249f606024beb65c9c4871d8b82aa47fe4a4f57 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Wed, 19 Feb 2020 12:25:30 +0000
Subject: [PATCH] [AArch64][SVE] Add intrinsics for non-temporal gather-loads/scatter-stores

Summary:
This patch adds the following LLVM IR intrinsics for SVE:
1. non-temporal gather loads
  * @llvm.aarch64.sve.ldnt1.gather
  * @llvm.aarch64.sve.ldnt1.gather.uxtw
  * @llvm.aarch64.sve.ldnt1.gather.scalar.offset
2. non-temporal scatter stores
  * @llvm.aarch64.sve.stnt1.scatter
  * @llvm.aarch64.sve.stnt1.scatter.uxtw
  * @llvm.aarch64.sve.stnt1.scatter.scalar.offset

These intrinsics are mapped to the corresponding SVE instructions
(example for half-words, zero-extending):
  * ldnt1h { z0.s }, p0/z, [z0.s, x0]
  * stnt1h { z0.s }, p0, [z0.s, x0]

Note that for non-temporal gathers/scatters, the SVE spec defines only one
instruction type: "vector + scalar". For this reason, we swap the arguments
when processing intrinsics that implement the "scalar + vector" addressing
mode:
  * @llvm.aarch64.sve.ldnt1.gather
  * @llvm.aarch64.sve.ldnt1.gather.uxtw
  * @llvm.aarch64.sve.stnt1.scatter
  * @llvm.aarch64.sve.stnt1.scatter.uxtw

In other words, all intrinsics for gather-loads and scatter-stores
implemented in this patch are mapped to the same load and store instruction,
respectively.

The sve2_mem_gldnt_vs multiclass (and its counterpart for scatter stores)
from SVEInstrFormats.td was split into:
  * sve2_mem_gldnt_vs_32_ptrs (32-bit wide base addresses)
  * sve2_mem_gldnt_vs_64_ptrs (64-bit wide base addresses)

This is consistent with what we did for
@llvm.aarch64.sve.ld1.gather.scalar.offset and highlights the actual split in
the spec and the implementation.

Reviewed by: sdesmalen

Differential Revision: https://reviews.llvm.org/D74858
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 32 ++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 39 +++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 +
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 47 +++---
 llvm/lib/Target/AArch64/SVEInstrFormats.td | 67 ++++++--
 ...insics-nt-gather-loads-32bit-unscaled-offset.ll | 96 +++++++++++
 ...insics-nt-gather-loads-64bit-unscaled-offset.ll | 103 +++++++++++
 ...cs-nt-gather-loads-vector-base-scalar-offset.ll | 188 +++++++++++++++++++++
 ...sics-nt-scatter-stores-32bit-unscaled-offset.ll | 77 +++++++++
 ...sics-nt-scatter-stores-64bit-unscaled-offset.ll | 70 ++++++++
 ...-nt-scatter-stores-vector-base-scalar-offset.ll | 134 +++++++++++++++
 11 files changed, 826 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ac5d530..3976dde 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1761,6 +1761,22 @@ def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic
 
 //
+// Non-temporal gather loads: scalar base + vector offsets
+//
+
+// 64 bit
unscaled offsets +def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; + +// 32 bit unscaled offsets, zero (zxtw) extended to 64 bits +def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; + +// +// Non-temporal gather loads: vector base + scalar offset +// + +def int_aarch64_sve_ldnt1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; + +// // Scatter stores: scalar base + vector offsets // @@ -1792,6 +1808,22 @@ def int_aarch64_sve_st1_scatter_uxtw_index def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; // +// Non-temporal scatter stores: scalar base + vector offsets +// + +// 64 bit unscaled offsets +def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic; + +// 32 bit unscaled offsets, zero (zxtw) extended to 64 bits +def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic; + +// +// Non-temporal scatter stores: vector base + scalar offset +// + +def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; + +// // SVE2 - Uniform DSP operations // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2346ce5..6466aae 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1438,6 +1438,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::GLDFF1S_UXTW_SCALED: return "AArch64ISD::GLDFF1S_UXTW_SCALED"; case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM"; + + case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1"; + case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S"; + case AArch64ISD::SST1: return "AArch64ISD::SST1"; case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; @@ -1445,6 +1449,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; + + case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1"; + case AArch64ISD::LDP: return "AArch64ISD::LDP"; case AArch64ISD::STP: return "AArch64ISD::STP"; case AArch64ISD::STNP: return "AArch64ISD::STNP"; @@ -10457,6 +10464,7 @@ static SDValue performSVEAndCombine(SDNode *N, case AArch64ISD::GLDFF1_UXTW: case AArch64ISD::GLDFF1_UXTW_SCALED: case AArch64ISD::GLDFF1_IMM: + case AArch64ISD::GLDNT1: MemVT = cast(Src->getOperand(4))->getVT(); break; default: @@ -12644,6 +12652,14 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
+ if (Opcode == AArch64ISD::SSTNT1 && Offset.getValueType().isVector()) + std::swap(Base, Offset); + // SST1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], @@ -12730,6 +12746,14 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "scalar + vector", i.e. + // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. + if (Opcode == AArch64ISD::GLDNT1 && Offset.getValueType().isVector()) + std::swap(Base, Offset); + // GLD{FF}1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], @@ -12859,6 +12883,9 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, case AArch64ISD::GLDFF1_IMM: NewOpc = AArch64ISD::GLDFF1S_IMM; break; + case AArch64ISD::GLDNT1: + NewOpc = AArch64ISD::GLDNT1S; + break; default: return SDValue(); } @@ -12972,12 +12999,24 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnf1: return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1); case Intrinsic::aarch64_sve_ldff1: return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_ld1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1); case Intrinsic::aarch64_sve_ld1_gather_index: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c5914b1..2f26744 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -261,6 +261,10 @@ enum NodeType : unsigned { GLDFF1S_SXTW_SCALED, GLDFF1S_IMM, + // Non-temporal gather loads + GLDNT1, + GLDNT1S, + // Scatter store SST1, SST1_SCALED, @@ -270,6 +274,9 @@ enum NodeType : unsigned { SST1_SXTW_SCALED, SST1_IMM, + // Non-temporal scatter store + SSTNT1, + // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3687b3a..542533a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -69,6 +69,9 @@ def AArch64ldff1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED", def 
AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + // Scatter stores - node definitions // def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ @@ -89,6 +92,8 @@ def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_ def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + // AArch64 SVE/SVE2 - the remaining node definitions // @@ -1909,32 +1914,32 @@ let Predicates = [HasSVE2] in { def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm 
STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; + + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 7cd35d8..b055012 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5071,16 +5071,36 @@ class sve2_mem_sstnt_vs_base opc, string asm, let mayStore = 1; } -multiclass sve2_mem_sstnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_sstnt_vs_base; +multiclass sve2_mem_sstnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, @@ -6529,17 +6549,38 @@ class sve2_mem_gldnt_vs_base opc, dag iops, string asm, let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base; +multiclass sve2_mem_gldnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : 
InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll new file mode 100644 index 0000000..ab570ce --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll @@ -0,0 +1,96 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 32-bit unscaled offsets, zero (uxtw) +; extended to 64 bits. +; e.g. ldnt1h { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1b_s_uxtw: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1h_s_uxtw: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LDNT1W +define @gldnt1w_s_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldnt1w_s_uxtw: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldnt1w_s_uxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldnt1w_s_uxtw_float: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +; LDNT1SB, LDNT1SW, LDNT1SH: base + 32-bit unscaled offsets, zero (uxtw) +; extended to 64 bits. +; e.g. 
ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1SB +define @gldnt1sb_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1sb_s_uxtw: +; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +; LDNT1SH +define @gldnt1sh_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1sh_s_uxtw: +; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +; LDNT1B/LDNT1SB +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(, i8*, ) +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i8(, i8*, ) + +; LDNT1H/LDNT1SH +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i16(, i16*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(, i16*, ) + +; LDNT1W/LDNT1SW +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(, i32*, ) + +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4f32(, float*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(, float*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll new file mode 100644 index 0000000..bc1e961 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 64-bit unscaled offsets +; e.g. ldnt1h { z0.d }, p0/z, [z0.d, x0] +; + +define @gldnt1b_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1h_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1h_d: +; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1w_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldnt1w_d: +; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = zext %load to + ret %res +} + +define @gldnt1d_d( %pg, i64* %base, %b) { +; CHECK-LABEL: gldnt1d_d: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldnt1d_d_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldnt1d_d_double: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDNT1SB, LDNT1SW, LDNT1SH: base + 64-bit unscaled offsets +; e.g. 
ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; + +define @gldnt1sb_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1sb_d: +; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldnt1sh_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1sh_d: +; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldnt1sw_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldnt1sw_d: +; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = sext %load to + ret %res +} + +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll new file mode 100644 index 0000000..65d3365 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset +; ldnt1b { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_s: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1b_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_s: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1h_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_d: +; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1W +define @gldnt1w_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_s: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldnt1w_s_float( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_s_float: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldnt1w_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_d: +; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + 
%base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1D +define @gldnt1d_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1d_d: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +; LDNT1D +define @gldnt1d_d_double( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1d_d_double: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +; +; LDNT1SB, LDNT1SW, LDNT1SH, LDNT1SD: vector base + scalar offset +; ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1SB +define @gldnt1sb_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sb_s: +; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +define @gldnt1sb_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sb_d: +; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1SH +define @gldnt1sh_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sh_s: +; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +define @gldnt1sh_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sh_d: +; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1SW +define @gldnt1sw_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sw_d: +; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1B/LDNT1SB +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(, , i64) + +; LDNT1H/LDNT1SH +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(, , i64) + +; LDNT1W/LDNT1SW +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(, , i64) + +; LDNT1D +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll new file mode 100644 index 0000000..56836ba --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll @@ -0,0 +1,77 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: base + 32-bit unscaled offset, zero (uxtw) +; extended to 64 bits. +; e.g. 
stnt1h { z0.d }, p0, [z1.d, x0] +; + +; STNT1B +define void @sstnt1b_s_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sstnt1b_s_uxtw: +; CHECK: stnt1b { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +; STNT1H +define void @sstnt1h_s_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sstnt1h_s_uxtw: +; CHECK: stnt1h { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +; STNT1W +define void @sstnt1w_s_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sstnt1w_s_uxtw: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sstnt1w_s_uxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sstnt1w_s_uxtw_float: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +; STNT1B +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i8(, , i8*, ) + +; STNT1H +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i16(, , i16*, ) + +; STNT1W +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i32(, , i32*, ) + +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4f32(, , float*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(, , float*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll new file mode 100644 index 0000000..6cf4d5d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: base + 64-bit unscaled offset +; e.g. 
stnt1h { z0.d }, p0, [z1.d, x0] +; + +define void @sstnt1b_d( %data, %pg, i8* %base, %b) { +; CHECK-LABEL: sstnt1b_d: +; CHECK: stnt1b { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8( %data_trunc, + %pg, + i8* %base, + %b) + ret void +} + +define void @sstnt1h_d( %data, %pg, i16* %base, %b) { +; CHECK-LABEL: sstnt1h_d: +; CHECK: stnt1h { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16( %data_trunc, + %pg, + i16* %base, + %b) + ret void +} + +define void @sstnt1w_d( %data, %pg, i32* %base, %b) { +; CHECK-LABEL: sstnt1w_d: +; CHECK: stnt1w { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32( %data_trunc, + %pg, + i32* %base, + %b) + ret void +} + +define void @sstnt1d_d( %data, %pg, i64* %base, %b) { +; CHECK-LABEL: sstnt1d_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64( %data, + %pg, + i64* %base, + %b) + ret void +} + +define void @sstnt1d_d_double( %data, %pg, double* %base, %b) { +; CHECK-LABEL: sstnt1d_d_double: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64( %data, + %pg, + double* %base, + %b) + ret void +} + +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll new file mode 100644 index 0000000..26d84fe --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll @@ -0,0 +1,134 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset +; stnt1b { z0.s }, p0/z, [z0.s, x0] +; + +; STNT1B +define void @stnt1b_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1b_s: +; CHECK: stnt1b { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1b_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1b_d: +; CHECK: stnt1b { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1H +define void @stnt1h_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1h_s: +; CHECK: stnt1h { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1h_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1h_d: +; CHECK: stnt1h { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1W +define 
void @stnt1w_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_s: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1w_f32_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_f32_s: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1w_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_d: +; CHECK: stnt1w { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1D +define void @stnt1d_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1d_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1d_f64_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1d_f64_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64( %data, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1B +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(, , , i64) + +; STNT1H +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(, , , i64) + +; STNT1W +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(, , , i64) + +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(, , , i64) + +; STNT1D +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(, , , i64) -- 2.7.4
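
Usage sketch (illustrative, not part of the diff): the "scalar base + vector of
32-bit offsets" gather form, with the scalable vector types spelled out
explicitly. The function name is made up; the expected assembly is taken from
the CHECK lines of the uxtw gather test above.

; Zero-extending the narrow result keeps the plain GLDNT1 node and selects
;   ldnt1b { z0.s }, p0/z, [z0.s, x0]
; a sext here would instead be folded into GLDNT1S and select ldnt1sb.
define <vscale x 4 x i32> @example_ldnt1b_uxtw(<vscale x 4 x i1> %pg, i8* %base,
                                               <vscale x 4 x i32> %offsets) {
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(
              <vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %offsets)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)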
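
A second sketch (illustrative, not part of the diff): the "vector base + scalar
offset" gather form, @llvm.aarch64.sve.ldnt1.gather.scalar.offset, which already
matches the "vector + scalar" form of the instruction, so no operand swap is
needed during lowering. The function name is made up; the expected assembly
matches the vector-base-scalar-offset gather test above.

define <vscale x 2 x i64> @example_ldnt1d_vector_base(<vscale x 2 x i1> %pg,
                                                      <vscale x 2 x i64> %base,
                                                      i64 %offset) {
  ; Expected to select: ldnt1d { z0.d }, p0/z, [z0.d, x0]
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(
              <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
  ret <vscale x 2 x i64> %load
}

declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)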
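
And a scatter-store sketch (illustrative, not part of the diff): the "scalar
base + vector of 64-bit offsets" form. Since only the "vector + scalar" STNT1
instruction exists, performScatterStoreCombine swaps %base and %offsets when it
builds the SSTNT1 node. The function name is made up; the expected assembly
matches the 64-bit-offset scatter test above.

define void @example_stnt1d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                            i64* %base, <vscale x 2 x i64> %offsets) {
  ; Expected to select: stnt1d { z0.d }, p0, [z1.d, x0]
  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64> %data,
                                                    <vscale x 2 x i1> %pg,
                                                    i64* %base,
                                                    <vscale x 2 x i64> %offsets)
  ret void
}

declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)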