//
+// Non-temporal gather loads: scalar base + vector offsets
+//
+
+// 64 bit unscaled offsets
+def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
+// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits
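+// For reference, an IR instantiation of this form looks like, e.g.:
+//   declare <vscale x 4 x i16>
+//       @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1>,
+//                                                   i16*, <vscale x 4 x i32>)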
+def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
+
+//
+// Non-temporal gather loads: vector base + scalar offset
+//
+
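+// For reference, an IR instantiation of this form looks like, e.g.:
+//   declare <vscale x 4 x i8>
+//       @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(
+//           <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+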
+def int_aarch64_sve_ldnt1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
+
+//
// Scatter stores: scalar base + vector offsets
//
def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
//
+// Non-temporal scatter stores: scalar base + vector offsets
+//
+
+// 64 bit unscaled offsets
+def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
+// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits
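+// For reference, e.g.:
+//   declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(
+//       <vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)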
+def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
+
+//
+// Non-temporal scatter stores: vector base + scalar offset
+//
+
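+// For reference, e.g.:
+//   declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(
+//       <vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+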
+def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
+
+//
// SVE2 - Uniform DSP operations
//
case AArch64ISD::GLDFF1S_UXTW_SCALED:
return "AArch64ISD::GLDFF1S_UXTW_SCALED";
case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM";
+
+ case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1";
+ case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S";
+
case AArch64ISD::SST1: return "AArch64ISD::SST1";
case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+
+ case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1";
+
case AArch64ISD::LDP: return "AArch64ISD::LDP";
case AArch64ISD::STP: return "AArch64ISD::STP";
case AArch64ISD::STNP: return "AArch64ISD::STNP";
case AArch64ISD::GLDFF1_UXTW:
case AArch64ISD::GLDFF1_UXTW_SCALED:
case AArch64ISD::GLDFF1_IMM:
+ case AArch64ISD::GLDNT1:
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
+  // In the case of non-temporal scatter stores there's only one SVE
+  // instruction per data-size: "scalar + vector", i.e.
+  // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may
+  // need to swap them here to match the instruction.
+ if (Opcode == AArch64ISD::SSTNT1 && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
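+  // e.g., schematically, stnt1.scatter(data, pg, x0, z1.d) (scalar base,
+  // vector offsets) still selects the single available form:
+  //   stnt1d { z0.d }, p0, [z1.d, x0]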
+
// SST1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
+ // In the case of non-temporal gather loads there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+  // Since the intrinsics allow the arguments in a different order, we may
+  // need to swap them here to match the instruction.
+ if (Opcode == AArch64ISD::GLDNT1 && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
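+  // e.g., schematically, ldnt1.gather(pg, x0, z0.d) (scalar base, vector
+  // offsets) selects:
+  //   ldnt1d { z0.d }, p0/z, [z0.d, x0]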
+
// GLD{FF}1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
case AArch64ISD::GLDFF1_IMM:
NewOpc = AArch64ISD::GLDFF1S_IMM;
break;
+ case AArch64ISD::GLDNT1:
+ NewOpc = AArch64ISD::GLDNT1S;
+ break;
default:
return SDValue();
}
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldnt1_gather:
+  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+    return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
case Intrinsic::aarch64_sve_ldnf1:
return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1);
case Intrinsic::aarch64_sve_ldff1:
return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
+  case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+  case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+  case Intrinsic::aarch64_sve_stnt1_scatter:
+    return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
case Intrinsic::aarch64_sve_ld1_gather_index:
GLDFF1S_SXTW_SCALED,
GLDFF1S_IMM,
+ // Non-temporal gather loads
+ GLDNT1,
+ GLDNT1S,
+
// Scatter store
SST1,
SST1_SCALED,
SST1_SXTW_SCALED,
SST1_IMM,
+ // Non-temporal scatter store
+ SSTNT1,
+
// Strict (exception-raising) floating point comparison
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPE,
def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
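+// Note: GLDNT1S has no intrinsic of its own; it is created when a
+// sign-extension of a GLDNT1 result is folded into the load during DAG
+// combining.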
+
// Scatter stores - node definitions
//
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
// AArch64 SVE/SVE2 - the remaining node definitions
//
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
- defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
- defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
-
- defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
+
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
let mayStore = 1;
}
-multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
+multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
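+  // Select the SSTNT1 node (data, predicate, vector base, scalar offset,
+  // memory element type) straight to the "scalar + vector" instruction.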
+ def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
+ asm, Z_s>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
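+  // Select the GLDNT1 node (predicate, vector base, scalar offset, memory
+  // element type) straight to the "scalar + vector" instruction.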
+ def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
+ asm, Z_d>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1H, LDNT1W: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. ldnt1h { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1B
+define <vscale x 4 x i32> @gldnt1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1b_s_uxtw:
+; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1H
+define <vscale x 4 x i32> @gldnt1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1h_s_uxtw:
+; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1w_s_uxtw:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1w_s_uxtw_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %b)
+ ret <vscale x 4 x float> %load
+}
+
+;
+; LDNT1SB, LDNT1SH: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1sb_s_uxtw:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: gldnt1sh_s_uxtw:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %b)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+
+; LDNT1W
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 64-bit unscaled offsets
+; e.g. ldnt1h { z0.d }, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1b_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1b_d:
+; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SW, LDNT1SH: base + 64-bit unscaled offsets
+; e.g. ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset
+; e.g. ldnt1b { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1B
+define <vscale x 4 x i32> @gldnt1b_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1b_s:
+; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1b_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1b_d:
+; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1H
+define <vscale x 4 x i32> @gldnt1h_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_s:
+; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1D
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x i64> %load
+}
+
+; LDNT1D
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SH, LDNT1SW: vector base + scalar offset
+; e.g. ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_s:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_s:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SW
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1W/LDNT1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDNT1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1H, STNT1W: base + 32-bit unscaled offsets, zero (uxtw)
+; extended to 64 bits.
+; e.g. stnt1h { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @sstnt1b_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1b_s_uxtw:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1H
+define void @sstnt1h_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1h_s_uxtw:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1W
+define void @sstnt1w_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1w_s_uxtw:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sstnt1w_s_uxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sstnt1w_s_uxtw_float:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: base + 64-bit unscaled offset
+; e.g. stnt1h { z0.d }, p0, [z1.d, x0]
+;
+
+define void @sstnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sstnt1d_d_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sstnt1d_d_double:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset
+; e.g. stnt1b { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @stnt1b_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_s:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1H
+define void @stnt1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_s:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1W
+define void @stnt1w_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1w_f32_s(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_f32_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1D
+define void @stnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+define void @stnt1d_f64_d(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_f64_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 %offset)
+ ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i8>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i8>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i16>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i16>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i32>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1D
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f32.nxv2i64(<vscale x 2 x float>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)