The generic implementation is umin(TC, VF * vscale).
Lowering to vsetvli for RISC-V will come in a future patch.
This patch is a pre-requisite to be able to CodeGen vectorized code from
D99750.
Reviewed By: reames, frasercrmck
Differential Revision: https://reviews.llvm.org/D149916
None.
+'``llvm.experimental.get.vector.length``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %vf, i1 immarg %scalable)
+ declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %vf, i1 immarg %scalable)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.get.vector.length.*``' intrinsics take a number of
+elements to process and return how many of the elements can be processed
+with the requested vectorization factor.
+
+Arguments:
+""""""""""
+
+The first argument is an unsigned value of any scalar integer type and specifies
+the total number of elements to be processed. The second argument is an i32
+immediate for the vectorization factor. The third argument indicates if the
+vectorization factor should be multiplied by vscale.
+
+Semantics:
+""""""""""
+
+Returns a non-negative i32 value (the explicit vector length) that is unknown
+at compile time and depends on the hardware specification.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+This intrinsic is intended to be used by loop vectorization with VP intrinsics
+in order to get the number of elements to process on each loop iteration. The
+result should be used to decrease the count for the next iteration until the
+count reaches zero.
+
+If the count is larger than the number of lanes in the type described by the
+last 2 arguments, this intrinsic may return a value less than the number of
+lanes implied by the type. The result will be at least as large as the result
+will be on any later loop iteration.
+
+This intrinsic will only return 0 if the input count is also 0. A non-zero input
+count will produce a non-zero result.
+
Matrix Intrinsics
-----------------
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm,
bool ConstantFold = true);
+  /// Return a node that represents the element count \p EC at type \p VT:
+  /// VSCALE * known-min-count for scalable counts, or a plain constant for
+  /// fixed counts.
+  SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                          bool ConstantFold = true);
+
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
return true;
}
+  // Return true if the @llvm.experimental.get.vector.length intrinsic should
+  // be expanded using generic code (umin of the trip count and the element
+  // count) in SelectionDAGBuilder. Targets with a native instruction for this
+  // operation override this to return false and lower it themselves.
+  virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
+                                           bool IsScalable) const {
+    return true;
+  }
+
+
// Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
// vecreduce(op(x, y)) for the reduction opcode RedOpc.
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
[llvm_anyint_ty, LLVMMatchType<1>],
[IntrNoMem, IntrNoSync, IntrWillReturn]>;
+// Returns how many elements of a given trip count can be processed with a
+// vectorization factor of VF (times vscale when the i1 flag is true).
+// Operands: any-int trip count, i32 immediate VF, i1 immediate scalable flag.
+def int_experimental_get_vector_length:
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_anyint_ty, llvm_i32_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
def int_experimental_vp_splice:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT));
}
+/// Return a node representing the element count \p EC at type \p VT:
+/// VSCALE * known-min-count for scalable counts, or a constant otherwise.
+SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                                      bool ConstantFold) {
+  // Forward ConstantFold to getVScale; previously it was silently dropped,
+  // so callers requesting no constant folding still got a folded node.
+  if (EC.isScalable())
+    return getVScale(DL, VT, APInt(VT.getSizeInBits(), EC.getKnownMinValue()),
+                     ConstantFold);
+
+  // Fixed element counts are always a plain constant.
+  return getConstant(EC.getKnownMinValue(), DL, VT);
+}
+
SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
APInt One(ResVT.getScalarSizeInBits(), 1);
return getStepVector(DL, ResVT, One);
setValue(&I, SetCC);
return;
}
+  case Intrinsic::experimental_get_vector_length: {
+    // Generic expansion of llvm.experimental.get.vector.length:
+    //   umin(count, VF [* vscale]), clipped to the i32 result type.
+    // The IR verifier rejects non-positive VF immediates, so this assert can
+    // only fire on IR that would not have passed verification.
+    assert(cast<ConstantInt>(I.getOperand(1))->getSExtValue() > 0 &&
+           "Expected positive VF");
+    unsigned VF = cast<ConstantInt>(I.getOperand(1))->getZExtValue();
+    bool IsScalable = cast<ConstantInt>(I.getOperand(2))->isOne();
+
+    SDValue Count = getValue(I.getOperand(0));
+    EVT CountVT = Count.getValueType();
+
+    // Targets with a native instruction for this (e.g. RISC-V vsetvli in a
+    // future patch) opt out of the generic expansion here.
+    if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    // Expand to a umin between the trip count and the maximum elements the type
+    // can hold.
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    // Extend the trip count to at least the result VT.
+    if (CountVT.bitsLT(VT)) {
+      Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count);
+      CountVT = VT;
+    }
+
+    // Maximum element count: VF, multiplied by vscale when scalable.
+    SDValue MaxEVL = DAG.getElementCount(sdl, CountVT,
+                                         ElementCount::get(VF, IsScalable));
+
+    SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL);
+    // Clip to the result type if needed. Per the LangRef semantics, a umin
+    // that does not fit the result type yields poison, so truncation is fine.
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+    setValue(&I, Trunc);
+    return;
+  }
case Intrinsic::vector_insert: {
SDValue Vec = getValue(I.getOperand(0));
SDValue SubVec = getValue(I.getOperand(1));
Call);
break;
}
+  case Intrinsic::experimental_get_vector_length: {
+    // The VF operand is an immediate; reject zero and negative values so the
+    // positive-VF assumption in SelectionDAGBuilder holds for verified IR.
+    ConstantInt *VF = cast<ConstantInt>(Call.getArgOperand(1));
+    Check(!VF->isNegative() && !VF->isZero(),
+          "get_vector_length: VF must be positive", Call);
+    break;
+  }
case Intrinsic::masked_load: {
Check(Call.getType()->isVectorTy(), "masked_load: must return a vector",
Call);
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+; Generic lowering of llvm.experimental.get.vector.length on AArch64 SVE:
+; umin(tc, vscale * 2) for the scalable VF=2 used below. cntd materializes
+; vscale * 2 (the number of 64-bit elements in a scalable vector).
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1)
+
+; i16 trip count is zero-extended up to the i32 result type before the umin.
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: csel w0, w0, w8, lo
+; CHECK-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: csel w0, w0, w8, lo
+; CHECK-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+; i64 trip count: umin is computed at i64 and then truncated to the i32 result.
+define i32 @vector_length_i64(i64 %tc) {
+; CHECK-LABEL: vector_length_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 2, i1 true)
+  ret i32 %a
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+
+; NOTE(review): both RUN lines substitute iXLen with i32; the usual convention
+; substitutes i64 for riscv64 -- confirm intent. The RV64 CHECK lines below
+; (including the sext.w on the XLen functions) were generated with i32, so the
+; RUN line and checks would have to be updated together.
+
+; Generic lowering: umin(tc, VF [* vscale]). For the scalable VF=2 cases,
+; vlenb >> 2 equals vscale * 2 (vscale == vlenb / 8).
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: bltu a0, a1, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: bltu a0, a1, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_length_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: bltu a0, a1, .LBB1_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB1_2:
+; RV64-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen:
+; RV32: # %bb.0:
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: bltu a0, a1, .LBB2_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: .LBB2_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_length_XLen:
+; RV64: # %bb.0:
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: srli a1, a1, 2
+; RV64-NEXT: bltu a0, a1, .LBB2_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB2_2:
+; RV64-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+; Fixed (non-scalable) cases: the bound is the literal VF, here 2.
+define i32 @vector_length_i16_fixed(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16_fixed:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: bltu a0, a1, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32_fixed(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32_fixed:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 2
+; RV32-NEXT: bltu a0, a1, .LBB4_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a0, 2
+; RV32-NEXT: .LBB4_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_length_i32_fixed:
+; RV64: # %bb.0:
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: li a1, 2
+; RV64-NEXT: bltu a0, a1, .LBB4_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a0, 2
+; RV64-NEXT: .LBB4_2:
+; RV64-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen_fixed:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 2
+; RV32-NEXT: bltu a0, a1, .LBB5_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a0, 2
+; RV32-NEXT: .LBB5_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_length_XLen_fixed:
+; RV64: # %bb.0:
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: li a1, 2
+; RV64-NEXT: bltu a0, a1, .LBB5_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a0, 2
+; RV64-NEXT: .LBB5_2:
+; RV64-NEXT: ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false)
+  ret i32 %a
+}
--- /dev/null
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+; Check that the IR verifier rejects non-positive VF immediates for
+; llvm.experimental.get.vector.length (negative and zero cases).
+
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+
+define i32 @vector_length_negative_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_zero_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true)
+  ret i32 %a
+}