[TargetLowering][AArch64][SVE] Take into account accessed type when clamping address

author Bradley Smith <bradley.smith@arm.com>

Mon, 28 Jun 2021 12:39:07 +0000 (13:39 +0100)

committer Bradley Smith <bradley.smith@arm.com>

Wed, 30 Jun 2021 12:30:18 +0000 (13:30 +0100)
author Bradley Smith <bradley.smith@arm.com>
Mon, 28 Jun 2021 12:39:07 +0000 (13:39 +0100)
committer Bradley Smith <bradley.smith@arm.com>
Wed, 30 Jun 2021 12:30:18 +0000 (13:30 +0100)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h

index 00ec95a85f8f39ddfae84477890529085041ab99..47d6ca43a5ac3e756dbda59ca2447bb3fd7b11ce 100644 (file)
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4488,6 +4488,14 @@ public:
    SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                    SDValue Index) const;
  
+  /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Index
+  /// located in memory for a vector of type \p VecVT starting at a base
+  /// address of \p VecPtr. If \p Index plus the size of \p SubVecVT is out of
+  /// bounds, the returned pointer is unspecified, but it will be such that
+  /// the entire sub-vector lies within the vector bounds.
+  SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
+                                 EVT SubVecVT, SDValue Index) const;
+
    /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
    /// method accepts integers as its arguments.
    SDValue expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

index 8392a5def8059f7d0ed2861b063eaad11095462c..d92b23f56e4defe8f47c3616e82d6f5b861e9078 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1370,17 +1370,19 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
                        MachinePointerInfo());
    }
  
-  StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
-
    SDValue NewLoad;
  
-  if (Op.getValueType().isVector())
+  if (Op.getValueType().isVector()) {
+    StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT,
+                                          Op.getValueType(), Idx);
      NewLoad =
          DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
-  else
+  } else {
+    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
      NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
                               MachinePointerInfo(),
                               VecVT.getVectorElementType());
+  }
  
    // Replace the chain going out of the store, by the one out of the load.
    DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
@@ -1405,6 +1407,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
  
    // Store the value to a temporary stack slot, then LOAD the returned part.
    EVT VecVT = Vec.getValueType();
+  EVT SubVecVT = Part.getValueType();
    SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
    int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    MachinePointerInfo PtrInfo =
@@ -1414,7 +1417,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
  
    // Then store the inserted part.
-  SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+  SDValue SubStackPtr =
+      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
  
    // Store the subvector.
    Ch = DAG.getStore(
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

index 7bc8d1d2333bdc031aef56c9e9ea09f30a29b46a..c018cfd0a2eaddf08eb1a59a5a997929b7342b5f 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1315,7 +1315,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
                                 SmallestAlign);
  
    // Store the new subvector into the specified index.
-  SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+  SDValue SubVecPtr =
+      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
    Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
                         MachinePointerInfo::getUnknownStack(MF));
  
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

index 801a994e7fdb436e1893e29daaf2d835abdf65bd..a5e3cc23972eecb0c4b5b68d5dc93e73acbc84d5 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7781,39 +7781,51 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
    return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment);
  }
  
-static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
-                                       SDValue Idx,
-                                       EVT VecVT,
-                                       const SDLoc &dl) {
+static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
+                                       EVT VecVT, const SDLoc &dl,
+                                       unsigned NumSubElts) {
    if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
      return Idx;
  
    EVT IdxVT = Idx.getValueType();
    unsigned NElts = VecVT.getVectorMinNumElements();
    if (VecVT.isScalableVector()) {
-    // If this is a constant index and we know the value is less than the
-    // minimum number of elements then it's safe to return Idx.
+    // If this is a constant index and we know the value plus the number of
+    // elements in the subvector minus one is less than the minimum number of
+    // elements then it's safe to return Idx.
      if (auto *IdxCst = dyn_cast<ConstantSDNode>(Idx))
-      if (IdxCst->getZExtValue() < NElts)
+      if (IdxCst->getZExtValue() + (NumSubElts - 1) < NElts)
          return Idx;
      SDValue VS =
          DAG.getVScale(dl, IdxVT, APInt(IdxVT.getFixedSizeInBits(), NElts));
-    SDValue Sub =
-        DAG.getNode(ISD::SUB, dl, IdxVT, VS, DAG.getConstant(1, dl, IdxVT));
+    unsigned SubOpcode = NumSubElts <= NElts ? ISD::SUB : ISD::USUBSAT;
+    SDValue Sub = DAG.getNode(SubOpcode, dl, IdxVT, VS,
+                              DAG.getConstant(NumSubElts, dl, IdxVT));
      return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
    }
-  if (isPowerOf2_32(NElts)) {
+  if (isPowerOf2_32(NElts) && NumSubElts == 1) {
      APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NElts));
      return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
                         DAG.getConstant(Imm, dl, IdxVT));
    }
+  unsigned MaxIndex = NumSubElts < NElts ? NElts - NumSubElts : 0;
    return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
-                     DAG.getConstant(NElts - 1, dl, IdxVT));
+                     DAG.getConstant(MaxIndex, dl, IdxVT));
  }
  
  SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
                                                  SDValue VecPtr, EVT VecVT,
                                                  SDValue Index) const {
+  return getVectorSubVecPointer(
+      DAG, VecPtr, VecVT,
+      EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
+      Index);
+}
+
+SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
+                                               SDValue VecPtr, EVT VecVT,
+                                               EVT SubVecVT,
+                                               SDValue Index) const {
    SDLoc dl(Index);
    // Make sure the index type is big enough to compute in.
    Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -7825,7 +7837,11 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
    assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
           "Converting bits to bytes lost precision");
  
-  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
+  assert(SubVecVT.isFixedLengthVector() &&
+         SubVecVT.getVectorElementType() == EltVT &&
+         "Sub-vector must be a fixed vector with matching element type");
+  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+                                  SubVecVT.getVectorNumElements());
  
    EVT IdxVT = Index.getValueType();
  
diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll

index b3bf4ac9975bbbea4fe031ae1ec4e7990e4c9e0c..ee69b7945fa45c592e19f004cb96980b724eab5c 100644 (file)
--- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll
+++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -24,7 +24,7 @@ define <vscale x 2 x i64> @test_nxv2i64_v8i64(<vscale x 2 x i64> %a, <8 x i64> %
  ; CHECK-NEXT:    cntd x9
  ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
  ; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
  ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
  ; CHECK-NEXT:    mov w8, #2
  ; CHECK-NEXT:    cmp x9, #2 // =2
@@ -74,7 +74,7 @@ define <vscale x 2 x double> @test_nxv2f64_v8f64(<vscale x 2 x double> %a, <8 x
  ; CHECK-NEXT:    cntd x9
  ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
  ; CHECK-NEXT:    str q1, [sp]
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
  ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
  ; CHECK-NEXT:    mov w8, #2
  ; CHECK-NEXT:    cmp x9, #2 // =2
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll

index 928407a5f919a114b6117f28b39172d7af47becc..89404417645671cae7d9ad60924a60d639bc99c5 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll
@@ -18,7 +18,7 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec) nounwind {
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
  ; CHECK-NEXT:    mov w8, #2
  ; CHECK-NEXT:    cmp x9, #2 // =2
  ; CHECK-NEXT:    ptrue p0.d
@@ -30,8 +30,8 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec) nounwind {
  ; CHECK-NEXT:    addvl sp, sp, #1
  ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
  ; CHECK-NEXT:    ret
-%retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
-ret <2 x i64> %retval
+  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+  ret <2 x i64> %retval
  }
  
  ; Should codegen to a nop, since idx is zero.
@@ -40,8 +40,8 @@ define <4 x i32> @extract_v4i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
  ; CHECK-NEXT:    ret
-%retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
-ret <4 x i32> %retval
+  %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+  ret <4 x i32> %retval
  }
  
  ; Goes through memory currently; idx != 0.
@@ -51,7 +51,7 @@ define <4 x i32> @extract_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec) nounwind {
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #4 // =4
  ; CHECK-NEXT:    mov w8, #4
  ; CHECK-NEXT:    cmp x9, #4 // =4
  ; CHECK-NEXT:    ptrue p0.s
@@ -84,7 +84,7 @@ define <8 x i16> @extract_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec) nounwind {
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #8 // =8
  ; CHECK-NEXT:    mov w8, #8
  ; CHECK-NEXT:    cmp x9, #8 // =8
  ; CHECK-NEXT:    ptrue p0.h
@@ -117,7 +117,7 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec) nounwind
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #16 // =16
  ; CHECK-NEXT:    ptrue p0.b
  ; CHECK-NEXT:    mov w8, #16
  ; CHECK-NEXT:    cmp x9, #16 // =16
@@ -151,11 +151,62 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
    ret <vscale x 1 x i16> %retval
  }
  
+; Fixed length clamping
+
+define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v2i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub x9, x9, #2 // =2
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ldr q0, [x9, x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+  ret <2 x i64> %retval
+}
+
+define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    subs x9, x9, #4 // =4
+; CHECK-NEXT:    csel x9, xzr, x9, lo
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w10, #4
+; CHECK-NEXT:    cmp x9, #4 // =4
+; CHECK-NEXT:    ptrue p1.d, vl4
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    csel x9, x9, x10, lo
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x10, x9, lsl #3]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
+  ret <4 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
  
  declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)
  declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32>, i64)
  declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16>, i64)
  declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8>, i64)
  
+declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64>, i64)
+
  declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
  declare <vscale x 1 x i16> @llvm.experimental.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll

index 2816e97e0986540c1bb663da253cb07f24287531..669c65e1e4a89f4e1820b81ce2cd1bbfe205e231 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -23,7 +23,7 @@ define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #2 // =2
  ; CHECK-NEXT:    mov w8, #2
  ; CHECK-NEXT:    cmp x9, #2 // =2
  ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -62,7 +62,7 @@ define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #4 // =4
  ; CHECK-NEXT:    mov w8, #4
  ; CHECK-NEXT:    cmp x9, #4 // =4
  ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -101,7 +101,7 @@ define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #8 // =8
  ; CHECK-NEXT:    mov w8, #8
  ; CHECK-NEXT:    cmp x9, #8 // =8
  ; CHECK-NEXT:    csel x8, x9, x8, lo
@@ -140,7 +140,7 @@ define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <
  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
  ; CHECK-NEXT:    addvl sp, sp, #-1
  ; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    sub x9, x9, #16 // =16
  ; CHECK-NEXT:    mov w8, #16
  ; CHECK-NEXT:    cmp x9, #16 // =16
  ; CHECK-NEXT:    ptrue p0.b
@@ -299,12 +299,66 @@ entry:
    ret <vscale x 6 x i16> %retval
  }
  
+; Fixed length clamping
+
+define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub x9, x9, #2 // =2
+; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    cmp x9, #2 // =2
+; CHECK-NEXT:    csel x8, x9, x8, lo
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str q1, [x9, x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
+  ret <vscale x 2 x i64> %retval
+}
+
+define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    subs x8, x8, #4 // =4
+; CHECK-NEXT:    csel x8, xzr, x8, lo
+; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    cmp x8, #4 // =4
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z0.d }, p1, [sp]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %subvec = load <4 x i64>, <4 x i64>* %ptr
+  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
+  ret <vscale x 2 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
  
  declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
  declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
  declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
  declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
  
+declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
+
  declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
  declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
  declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
author	Bradley Smith <bradley.smith@arm.com>
	Mon, 28 Jun 2021 12:39:07 +0000 (13:39 +0100)
committer	Bradley Smith <bradley.smith@arm.com>
	Wed, 30 Jun 2021 12:30:18 +0000 (13:30 +0100)
llvm/include/llvm/CodeGen/TargetLowering.h		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/split-vector-insert.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-extract-vector.ll		patch \| blob \| history
llvm/test/CodeGen/AArch64/sve-insert-vector.ll		patch \| blob \| history