SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
SDValue Index) const;
+ /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located
+ /// in memory for a vector of type \p VecVT starting at a base address of
+ /// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds, the
+ /// returned pointer is unspecified, but it will be clamped such that the
+ /// entire subvector lies within the bounds of the vector.
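+ ///
+ /// For example (an illustrative caller-side sketch; \c StackPtr, \c Chain,
+ /// and \c DL are assumed locals), a fixed-length sub-vector spilled to a
+ /// stack temporary could be reloaded via:
+ /// \code
+ ///   SDValue SubPtr =
+ ///       TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
+ ///   SDValue Sub = DAG.getLoad(SubVecVT, DL, Chain, SubPtr,
+ ///                             MachinePointerInfo());
+ /// \endcode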
+ SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
+ EVT SubVecVT, SDValue Index) const;
+
/// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
/// method accepts integers as its arguments.
SDValue expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const;
MachinePointerInfo());
}
- StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
-
SDValue NewLoad;
- if (Op.getValueType().isVector())
+ if (Op.getValueType().isVector()) {
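+ // getVectorSubVecPointer clamps the index so the whole sub-vector load
+ // stays within the spilled vector's stack slot.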
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT,
+ Op.getValueType(), Idx);
NewLoad =
DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
- else
+ } else {
+ StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
MachinePointerInfo(),
VecVT.getVectorElementType());
+ }
// Replace the chain going out of the store, by the one out of the load.
DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
// Store the value to a temporary stack slot, then LOAD the returned part.
EVT VecVT = Vec.getValueType();
+ EVT SubVecVT = Part.getValueType();
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo PtrInfo =
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
// Then store the inserted part.
- SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+ SDValue SubStackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
// Store the subvector.
Ch = DAG.getStore(
SmallestAlign);
// Store the new subvector into the specified index.
- SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+ SDValue SubVecPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
MachinePointerInfo::getUnknownStack(MF));
return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment);
}
-static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
- SDValue Idx,
- EVT VecVT,
- const SDLoc &dl) {
+static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
+ EVT VecVT, const SDLoc &dl,
+ unsigned NumSubElts) {
if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
return Idx;
EVT IdxVT = Idx.getValueType();
unsigned NElts = VecVT.getVectorMinNumElements();
if (VecVT.isScalableVector()) {
- // If this is a constant index and we know the value is less than the
- // minimum number of elements then it's safe to return Idx.
+ // If this is a constant index and we know the value plus the number of
+ // elements in the subvector minus one is less than the minimum number of
+ // elements, then it's safe to return Idx.
if (auto *IdxCst = dyn_cast<ConstantSDNode>(Idx))
- if (IdxCst->getZExtValue() < NElts)
+ if (IdxCst->getZExtValue() + (NumSubElts - 1) < NElts)
return Idx;
SDValue VS =
DAG.getVScale(dl, IdxVT, APInt(IdxVT.getFixedSizeInBits(), NElts));
- SDValue Sub =
- DAG.getNode(ISD::SUB, dl, IdxVT, VS, DAG.getConstant(1, dl, IdxVT));
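+ // If the sub-vector can contain more elements than the vector's known
+ // minimum (NumSubElts > NElts), VS - NumSubElts may underflow, so use a
+ // saturating subtract to clamp the upper bound at zero.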
+ unsigned SubOpcode = NumSubElts <= NElts ? ISD::SUB : ISD::USUBSAT;
+ SDValue Sub = DAG.getNode(SubOpcode, dl, IdxVT, VS,
+ DAG.getConstant(NumSubElts, dl, IdxVT));
return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
}
- if (isPowerOf2_32(NElts)) {
+ if (isPowerOf2_32(NElts) && NumSubElts == 1) {
APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NElts));
return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
DAG.getConstant(Imm, dl, IdxVT));
}
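+ // Otherwise clamp the start index so that Idx + NumSubElts never exceeds
+ // NElts; if the sub-vector is at least as large as the vector, zero is the
+ // only valid start.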
+ unsigned MaxIndex = NumSubElts < NElts ? NElts - NumSubElts : 0;
return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
- DAG.getConstant(NElts - 1, dl, IdxVT));
+ DAG.getConstant(MaxIndex, dl, IdxVT));
}
SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
SDValue VecPtr, EVT VecVT,
SDValue Index) const {
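+ // A pointer to an element is just a pointer to a one-element sub-vector.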
+ return getVectorSubVecPointer(
+ DAG, VecPtr, VecVT,
+ EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
+ Index);
+}
+
+SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
+ SDValue VecPtr, EVT VecVT,
+ EVT SubVecVT,
+ SDValue Index) const {
SDLoc dl(Index);
// Make sure the index type is big enough to compute in.
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
"Converting bits to bytes lost precision");
- Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
+ assert(SubVecVT.isFixedLengthVector() &&
+ SubVecVT.getVectorElementType() == EltVT &&
+ "Sub-vector must be a fixed vector with matching element type");
+ Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+ SubVecVT.getVectorNumElements());
EVT IdxVT = Index.getValueType();
; CHECK-NEXT: cntd x9
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #2 // =2
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: mov w8, #2
; CHECK-NEXT: cmp x9, #2 // =2
; CHECK-NEXT: cntd x9
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #2 // =2
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: mov w8, #2
; CHECK-NEXT: cmp x9, #2 // =2
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cntd x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #2 // =2
; CHECK-NEXT: mov w8, #2
; CHECK-NEXT: cmp x9, #2 // =2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
-%retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
-ret <2 x i64> %retval
+ %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+ ret <2 x i64> %retval
}
; Should codegen to a nop, since idx is zero.
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
-%retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
-ret <4 x i32> %retval
+ %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+ ret <4 x i32> %retval
}
; Goes through memory currently; idx != 0.
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cntw x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #4 // =4
; CHECK-NEXT: mov w8, #4
; CHECK-NEXT: cmp x9, #4 // =4
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #8 // =8
; CHECK-NEXT: mov w8, #8
; CHECK-NEXT: cmp x9, #8 // =8
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #16 // =16
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov w8, #16
; CHECK-NEXT: cmp x9, #16 // =16
ret <vscale x 1 x i16> %retval
}
+; Fixed length clamping
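+; With vscale_range(2,2), <vscale x 2 x i64> holds exactly four doublewords,
+; so cntd always returns 4 and the clamped index below resolves to an
+; in-bounds offset.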
+
+define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v2i64_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: sub x9, x9, #2 // =2
+; CHECK-NEXT: mov w8, #2
+; CHECK-NEXT: cmp x9, #2 // =2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ldr q0, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %vec, i64 2)
+ ret <2 x i64> %retval
+}
+
+define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {
+; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: subs x9, x9, #4 // =4
+; CHECK-NEXT: csel x9, xzr, x9, lo
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov w10, #4
+; CHECK-NEXT: cmp x9, #4 // =4
+; CHECK-NEXT: ptrue p1.d, vl4
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: csel x9, x9, x10, lo
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x10, x9, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p1, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
+ ret <4 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)
declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32>, i64)
declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16>, i64)
declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8>, i64)
+declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64>, i64)
+
declare <vscale x 1 x i32> @llvm.experimental.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
declare <vscale x 1 x i16> @llvm.experimental.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cntd x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #2 // =2
; CHECK-NEXT: mov w8, #2
; CHECK-NEXT: cmp x9, #2 // =2
; CHECK-NEXT: csel x8, x9, x8, lo
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cntw x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #4 // =4
; CHECK-NEXT: mov w8, #4
; CHECK-NEXT: cmp x9, #4 // =4
; CHECK-NEXT: csel x8, x9, x8, lo
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #8 // =8
; CHECK-NEXT: mov w8, #8
; CHECK-NEXT: cmp x9, #8 // =8
; CHECK-NEXT: csel x8, x9, x8, lo
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: sub x9, x9, #1 // =1
+; CHECK-NEXT: sub x9, x9, #16 // =16
; CHECK-NEXT: mov w8, #16
; CHECK-NEXT: cmp x9, #16 // =16
; CHECK-NEXT: ptrue p0.b
ret <vscale x 6 x i16> %retval
}
+; Fixed length clamping
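+; With vscale_range(2,2), <vscale x 2 x i64> holds exactly four doublewords,
+; so cntd always returns 4 and the clamped index below resolves to an
+; in-bounds offset.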
+
+define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: sub x9, x9, #2 // =2
+; CHECK-NEXT: mov w8, #2
+; CHECK-NEXT: cmp x9, #2 // =2
+; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str q1, [x9, x8]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
+ ret <vscale x 2 x i64> %retval
+}
+
+define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 {
+; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: subs x8, x8, #4 // =4
+; CHECK-NEXT: csel x8, xzr, x8, lo
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: cmp x8, #4 // =4
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1d { z0.d }, p1, [sp]
+; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %subvec = load <4 x i64>, <4 x i64>* %ptr
+ %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
+ ret <vscale x 2 x i64> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
+
declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)