From: ShihPo Hung Date: Wed, 30 Nov 2022 12:58:52 +0000 (-0800) Subject: [RISCV] Add cost model for fixed broadcast shuffle X-Git-Tag: upstream/17.0.6~25887 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0e6f0b7cc38391f3365a862266a8aef50d093135;p=platform%2Fupstream%2Fllvm.git [RISCV] Add cost model for fixed broadcast shuffle This patch adds basic broadcast shuffle costs in order to enable SLP vectorization. And adds `getLMULCost` to consider reciprocal throughput for different LMUL. Reviewed By: reames Differential Revision: https://reviews.llvm.org/D137276 --- diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 3d39789..28a8be7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -31,6 +31,27 @@ static cl::opt<unsigned> SLPMaxVF( "SLP vectorizer. Defaults to 1 which disables SLP."), cl::init(1), cl::Hidden); +InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) { + // TODO: Here assume reciprocal throughput is 1 for LMUL_1, it is + // implementation-defined. 
+ if (!VT.isVector()) + return InstructionCost::getInvalid(); + unsigned Cost; + if (VT.isScalableVector()) { + unsigned LMul; + bool Fractional; + std::tie(LMul, Fractional) = + RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT)); + if (Fractional) + Cost = 1; + else + Cost = LMul; + } else { + Cost = VT.getSizeInBits() / ST->getRealMinVLen(); + } + return std::max<unsigned>(Cost, 1); +} + InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy() && @@ -255,6 +276,44 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, } } + if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) { + std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); + bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == + Instruction::InsertElement); + if (LT.second.getScalarSizeInBits() == 1) { + if (HasScalar) { + // Example sequence: + // andi a0, a0, 1 + // vsetivli zero, 2, e8, mf8, ta, ma (ignored) + // vmv.v.x v8, a0 + // vmsne.vi v0, v8, 0 + return LT.first * getLMULCost(LT.second) * 3; + } + // Example sequence: + // vsetivli zero, 2, e8, mf8, ta, mu (ignored) + // vmv.v.i v8, 0 + // vmerge.vim v8, v8, 1, v0 + // vmv.x.s a0, v8 + // andi a0, a0, 1 + // vmv.v.x v8, a0 + // vmsne.vi v0, v8, 0 + + return LT.first * getLMULCost(LT.second) * 6; + } + + if (HasScalar) { + // Example sequence: + // vmv.v.x v8, a0 + return LT.first * getLMULCost(LT.second); + } + + // Example sequence: + // vrgather.vi v9, v8, 0 + // TODO: vrgather could be slower than vmv.v.x. It is + // implementation-dependent. 
+ return LT.first * getLMULCost(LT.second); + } + return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index bbd9032..36dd869 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -46,6 +46,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { /// the true cost significantly if getVScaleForTuning is wildly off for the /// actual target hardware. unsigned getEstimatedVLFor(VectorType *Ty); + + /// Return the cost of LMUL. The larger the LMUL, the higher the cost. + InstructionCost getLMULCost(MVT VT); + public: explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll new file mode 100644 index 0000000..916df3a --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+experimental-zvfh | FileCheck %s + +define void @broadcast_fixed() #0{ +; CHECK-LABEL: 'broadcast_fixed' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector <16 x half> 
undef, <16 x half> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer + %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer + %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer + %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer + %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer + %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer + + %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer + %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer + %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer + %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer + %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer + + %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer + %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 
x i32> zeroinitializer + %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer + %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer + + %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer + %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer + %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer + %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer + %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer + %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer + %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer + + %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer + %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer + %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer + %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer + %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer + %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer + + %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer + %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer + %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer + %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer + %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer + + %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer + %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer + %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer + %36 = shufflevector <16 x i64> undef, <16 x i64> 
undef, <16 x i32> zeroinitializer + + %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer + %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer + %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer + %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer + %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer + %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer + %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer + + %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0 + %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer + + %ins2 = insertelement <2 x i8> poison, i8 3, i32 0 + %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer + ret void +}