//
+// Non-temporal gather loads: scalar base + vector offsets
+//
+
+// 64 bit unscaled offsets
+def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
+// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits
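+// For reference, an IR instantiation of this form looks like, e.g.:
+//   declare <vscale x 4 x i16>
+//       @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1>,
+//                                                   i16*, <vscale x 4 x i32>)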
+def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+
+//
+// Non-temporal gather loads: vector base + scalar offset
+//
+
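+// For reference, an IR instantiation of this form looks like, e.g.:
+//   declare <vscale x 4 x i8>
+//       @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(
+//           <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+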
+def int_aarch64_sve_ldnt1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+
+//
// Scatter stores: scalar base + vector offsets
//
def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
//
+// Non-temporal scatter stores: scalar base + vector offsets
+//
+
+// 64 bit unscaled offsets
+def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
+// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits
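+// For reference, e.g.:
+//   declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(
+//       <vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)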
+def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
+
+//
+// Non-temporal scatter stores: vector base + scalar offset
+//
+
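+// For reference, e.g.:
+//   declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(
+//       <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+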
+def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
+
+//
// SVE2 - Uniform DSP operations
//
case AArch64ISD::GLDFF1S_UXTW_SCALED:
return "AArch64ISD::GLDFF1S_UXTW_SCALED";
case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM";
+
+ case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1";
+ case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S";
+
case AArch64ISD::SST1: return "AArch64ISD::SST1";
case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+
+ case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1";
+
case AArch64ISD::LDP: return "AArch64ISD::LDP";
case AArch64ISD::STP: return "AArch64ISD::STP";
case AArch64ISD::STNP: return "AArch64ISD::STNP";
case AArch64ISD::GLDFF1_UXTW:
case AArch64ISD::GLDFF1_UXTW_SCALED:
case AArch64ISD::GLDFF1_IMM:
+ case AArch64ISD::GLDNT1:
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
+  // In the case of non-temporal scatter stores there's only one SVE
+  // instruction per data-size: "scalar + vector", i.e.
+  // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may
+  // need to swap them here to match the instruction.
+ if (Opcode == AArch64ISD::SSTNT1 && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
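+  // e.g., schematically, stnt1.scatter(data, pg, x0, z1.d) (scalar base,
+  // vector offsets) still selects the single available form:
+  //   stnt1d { z0.d }, p0, [z1.d, x0]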
+
// SST1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
+ // In the case of non-temporal gather loads there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may
+  // need to swap them here to match the instruction.
+ if (Opcode == AArch64ISD::GLDNT1 && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
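+  // e.g., schematically, ldnt1.gather(pg, x0, z0.d) (scalar base, vector
+  // offsets) selects:
+  //   ldnt1d { z0.d }, p0/z, [z0.d, x0]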
+
// GLD{FF}1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
case AArch64ISD::GLDFF1_IMM:
NewOpc = AArch64ISD::GLDFF1S_IMM;
break;
+ case AArch64ISD::GLDNT1:
+ NewOpc = AArch64ISD::GLDNT1S;
+ break;
default:
return SDValue();
}
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldnt1_gather:
+  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+    return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
case Intrinsic::aarch64_sve_ldnf1:
return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1);
case Intrinsic::aarch64_sve_ldff1:
return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+  case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+  case Intrinsic::aarch64_sve_stnt1_scatter:
+    return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
case Intrinsic::aarch64_sve_ld1_gather_index:
GLDFF1S_SXTW_SCALED,
GLDFF1S_IMM,
+ // Non-temporal gather loads
+ GLDNT1,
+ GLDNT1S,
+
// Scatter store
SST1,
SST1_SCALED,
SST1_SXTW_SCALED,
SST1_IMM,
+ // Non-temporal scatter store
+ SSTNT1,
+
// Strict (exception-raising) floating point comparison
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPE,
def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
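+// Note: GLDNT1S has no intrinsic of its own; it is created when a
+// sign-extension of a GLDNT1 result is folded into the load during DAG
+// combining.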
+
// Scatter stores - node definitions
//
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
// AArch64 SVE/SVE2 - the remaining node definitions
//
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
- defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
- defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
-
- defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
+
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
let mayStore = 1;
}
-multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
+multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
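+  // Select the SSTNT1 node (data, predicate, vector base, scalar offset,
+  // memory element type) straight to the "scalar + vector" instruction.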
+ def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
+ asm, Z_s>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
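+  // Select the GLDNT1 node (predicate, vector base, scalar offset, memory
+  // element type) straight to the "scalar + vector" instruction.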
+ def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
+ asm, Z_d>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1H, LDNT1W: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. ldnt1h { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1B
+define <vscale x 4 x i32> @gldnt1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1b_s_uxtw:
+; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1H
+define <vscale x 4 x i32> @gldnt1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1h_s_uxtw:
+; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1w_s_uxtw:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1w_s_uxtw_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+;
+; LDNT1SB, LDNT1SH: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1sb_s_uxtw:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1sh_s_uxtw:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+
+; LDNT1W
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 64-bit unscaled offsets
+; e.g. ldnt1h { z0.d }, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1b_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1b_d:
+; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SW, LDNT1SH: base + 64-bit unscaled offsets
+; e.g. ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset
+; e.g. ldnt1b { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1B
+define <vscale x 4 x i32> @gldnt1b_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1b_s:
+; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1b_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1b_d:
+; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1H
+define <vscale x 4 x i32> @gldnt1h_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_s:
+; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1D
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x i64> %load
+}
+
+; LDNT1D
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SH, LDNT1SW: vector base + scalar offset
+; e.g. ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_s:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_s:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SW
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1W/LDNT1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDNT1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1H, STNT1W: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. stnt1h { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @sstnt1b_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1b_s_uxtw:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1H
+define void @sstnt1h_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1h_s_uxtw:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1W
+define void @sstnt1w_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1w_s_uxtw:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sstnt1w_s_uxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1w_s_uxtw_float:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: base + 64-bit unscaled offset
+; e.g. stnt1h { z0.d }, p0, [z1.d, x0]
+;
+
+define void @sstnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1d_d_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1d_d_double:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset
+; e.g. stnt1b { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @stnt1b_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_s:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1H
+define void @stnt1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_s:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1W
+define void @stnt1w_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1w_f32_s(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_f32_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1D
+define void @stnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1d_f64_d(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_f64_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i8>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i16>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i16>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i32>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1D
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f32.nxv2i64(<vscale x 2 x float>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)