From: ShihPo Hung <shihpo.hung@sifive.com>
Date: Wed, 30 Nov 2022 12:58:52 +0000 (-0800)
Subject:  [RISCV] Add cost model for fixed broadcast shuffle
X-Git-Tag: upstream/17.0.6~25887
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0e6f0b7cc38391f3365a862266a8aef50d093135;p=platform%2Fupstream%2Fllvm.git

 [RISCV] Add cost model for fixed broadcast shuffle

This patch adds basic costs for broadcast shuffles of fixed-length vectors in order to enable SLP vectorization.
It also adds `getLMULCost` to account for the reciprocal throughput at different LMULs.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D137276
---

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 3d39789..28a8be7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -31,6 +31,27 @@ static cl::opt<unsigned> SLPMaxVF(
         "SLP vectorizer.  Defaults to 1 which disables SLP."),
     cl::init(1), cl::Hidden);
 
+InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
+  // TODO: This assumes a reciprocal throughput of 1 at LMUL_1 and scales it
+  // with the LMUL; the actual throughput is implementation-defined.
+  if (!VT.isVector())
+    return InstructionCost::getInvalid();
+  unsigned Cost;
+  if (VT.isScalableVector()) {
+    unsigned LMul;
+    bool Fractional;
+    std::tie(LMul, Fractional) =
+        RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
+    if (Fractional)
+      Cost = 1;
+    else
+      Cost = LMul;
+  } else {
+    Cost = VT.getSizeInBits() / ST->getRealMinVLen();
+  }
+  return std::max<unsigned>(Cost, 1);
+}
+
 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                             TTI::TargetCostKind CostKind) {
   assert(Ty->isIntegerTy() &&
@@ -255,6 +276,44 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     }
   }
 
+  if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
+                                           Instruction::InsertElement);
+    if (LT.second.getScalarSizeInBits() == 1) {
+      if (HasScalar) {
+        // Example sequence:
+        //   andi a0, a0, 1
+        //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
+        //   vmv.v.x v8, a0
+        //   vmsne.vi v0, v8, 0
+        return LT.first * getLMULCost(LT.second) * 3;
+      }
+      // Example sequence:
+      //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
+      //   vmv.v.i v8, 0
+      //   vmerge.vim      v8, v8, 1, v0
+      //   vmv.x.s a0, v8
+      //   andi    a0, a0, 1
+      //   vmv.v.x v8, a0
+      //   vmsne.vi  v0, v8, 0
+
+      return LT.first * getLMULCost(LT.second) * 6;
+    }
+
+    if (HasScalar) {
+      // Example sequence:
+      //   vmv.v.x v8, a0
+      return LT.first * getLMULCost(LT.second);
+    }
+
+    // Example sequence:
+    //   vrgather.vi     v9, v8, 0
+    // TODO: vrgather could be slower than vmv.v.x. It is
+    // implementation-dependent.
+    return LT.first * getLMULCost(LT.second);
+  }
+
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index bbd9032..36dd869 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -46,6 +46,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   /// the true cost significantly if getVScaleForTuning is wildly off for the
   /// actual target hardware.
   unsigned getEstimatedVLFor(VectorType *Ty);
+
+  /// Return the LMUL-based cost for \p VT; a larger LMUL gives a higher cost.
+  InstructionCost getLMULCost(MVT VT);
+
 public:
   explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll
new file mode 100644
index 0000000..916df3a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+experimental-zvfh | FileCheck %s
+
+define void  @broadcast_fixed() #0{
+; CHECK-LABEL: 'broadcast_fixed'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+  %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+  %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+  %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+  %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
+  %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
+
+  %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+  %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+  %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+  %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
+  %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
+
+  %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+  %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+  %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
+  %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
+
+  %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+  %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+  %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+  %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+  %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+  %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
+  %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
+
+  %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+  %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+  %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+  %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+  %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
+  %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
+
+  %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+  %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+  %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+  %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
+  %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
+
+  %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+  %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+  %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
+  %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
+
+  %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
+  %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
+  %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
+  %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
+  %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
+  %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
+  %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
+
+  %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
+  %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
+
+  %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
+  %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
+  ret void
+}