[TTI] Change getOperandsScalarizationOverhead to take Type args

author David Green <david.green@arm.com>

Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)

committer David Green <david.green@arm.com>

Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)
author David Green <david.green@arm.com>
Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)
committer David Green <david.green@arm.com>
Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h

index 255a02e7ffd3c7c5be93bf6a1c0698f9467c41d5..8205df794a3b8562a3e9288a19ca92e0e73e4087 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -726,10 +726,10 @@ public:
                                      bool Insert, bool Extract) const;
  
    /// Estimate the overhead of scalarizing an instructions unique
-  /// non-constant operands. The types of the arguments are ordinarily
-  /// scalar, in which case the costs are multiplied with VF.
+  /// non-constant operands. The (potentially vector) types to use for each of
+  /// argument are passes via Tys.
    unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const;
+                                            ArrayRef<Type *> Tys) const;
  
    /// If target has efficient vector element load/store instructions, it can
    /// return true here so that insertion/extraction costs are not added to
@@ -1479,7 +1479,7 @@ public:
                                              bool Insert, bool Extract) = 0;
    virtual unsigned
    getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   unsigned VF) = 0;
+                                   ArrayRef<Type *> Tys) = 0;
    virtual bool supportsEfficientVectorElementLoadStore() = 0;
    virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
    virtual MemCmpExpansionOptions
@@ -1864,8 +1864,8 @@ public:
      return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
    }
    unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) override {
-    return Impl.getOperandsScalarizationOverhead(Args, VF);
+                                            ArrayRef<Type *> Tys) override {
+    return Impl.getOperandsScalarizationOverhead(Args, Tys);
    }
  
    bool supportsEfficientVectorElementLoadStore() override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

index 97259bb1e61125869c0ebc73b720b30a257e37cc..83b65760cef63925bd77f0e664ae2dd0f3ccd085 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -289,7 +289,7 @@ public:
    }
  
    unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const {
+                                            ArrayRef<Type *> Tys) const {
      return 0;
    }
  
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h

index c2b1a7a7f83589b19e73cdfe5dd9febf7a5cda80..b7468e831123b0444c4c4a4f6759ca1417a33d8b 100644 (file)
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -609,51 +609,48 @@ public:
      return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
    }
  
-  /// Estimate the overhead of scalarizing an instruction's unique
-  /// non-constant operands. The types of the arguments are ordinarily
-  /// scalar, in which case the costs are multiplied with VF.
+  /// Estimate the overhead of scalarizing an instructions unique
+  /// non-constant operands. The (potentially vector) types to use for each of
+  /// argument are passes via Tys.
    unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) {
+                                            ArrayRef<Type *> Tys) {
+    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
+
      unsigned Cost = 0;
      SmallPtrSet<const Value*, 4> UniqueOperands;
-    for (const Value *A : Args) {
+    for (int I = 0, E = Args.size(); I != E; I++) {
        // Disregard things like metadata arguments.
-      Type *Ty = A->getType();
+      const Value *A = Args[I];
+      Type *Ty = Tys[I];
        if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
            !Ty->isPtrOrPtrVectorTy())
          continue;
  
        if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
-        auto *VecTy = dyn_cast<VectorType>(Ty);
-        if (VecTy) {
-          // If A is a vector operand, VF should be 1 or correspond to A.
-          assert((VF == 1 ||
-                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
-                 "Vector argument does not match VF");
-        }
-        else
-          VecTy = FixedVectorType::get(Ty, VF);
-
-        Cost += getScalarizationOverhead(VecTy, false, true);
+        if (auto *VecTy = dyn_cast<VectorType>(Ty))
+          Cost += getScalarizationOverhead(VecTy, false, true);
        }
      }
  
      return Cost;
    }
  
-  unsigned getScalarizationOverhead(VectorType *InTy,
-                                    ArrayRef<const Value *> Args) {
-    auto *Ty = cast<FixedVectorType>(InTy);
-
+  /// Estimate the overhead of scalarizing the inputs and outputs of an
+  /// instruction, with return type RetTy and arguments Args of type Tys. If
+  /// Args are unknown (empty), then the cost associated with one argument is
+  /// added as a heuristic.
+  unsigned getScalarizationOverhead(VectorType *RetTy,
+                                    ArrayRef<const Value *> Args,
+                                    ArrayRef<Type *> Tys) {
      unsigned Cost = 0;
  
-    Cost += getScalarizationOverhead(Ty, true, false);
+    Cost += getScalarizationOverhead(RetTy, true, false);
      if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
+      Cost += getOperandsScalarizationOverhead(Args, Tys);
      else
        // When no information on arguments is provided, we add the cost
        // associated with one argument as a heuristic.
-      Cost += getScalarizationOverhead(Ty, false, true);
+      Cost += getScalarizationOverhead(RetTy, false, true);
  
      return Cost;
    }
@@ -710,7 +707,8 @@ public:
            Opd1PropInfo, Opd2PropInfo, Args, CxtI);
        // Return the cost of multiple scalar invocation plus the cost of
        // inserting and extracting the values.
-      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
      }
  
      // We don't know anything about this scalar instruction.
@@ -1354,7 +1352,7 @@ public:
          ScalarizationCost +=
              getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
        ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
      }
  
      IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp

index ac2e5c5c2aa7c9d774acd94b01ded571fa3cb4ba..0b2fdcbca402251cf616e4b1fbb4834764f846d9 100644 (file)
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -469,8 +469,8 @@ TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
  }
  
  unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, unsigned VF) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
  }
  
  bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

index 2b0f9b5599d8370df2caef25a54302242be2f756..eb2733ff0310af830e4526540225c6b9b547bba7 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -549,7 +549,8 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
            Opd1PropInfo, Opd2PropInfo, Args, CxtI);
        // Return the cost of multiple scalar invocation plus the cost of
        // inserting and extracting the values.
-      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
      }
  
      // We don't know anything about this scalar instruction.
@@ -752,7 +753,8 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
        if (!RetTy->isVoidTy())
          ScalarizationCost +=
              getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
-      ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+      ScalarizationCost +=
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
      }
  
      IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

index 023c44d4f149d790c176c3d30c7201463fdd47b8..4b403c83037423d964f9726598ae71c5ece48e03 100644 (file)
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1363,7 +1363,8 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                             CostKind);
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
-    return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
+    SmallVector<Type *> Tys(Args.size(), Ty);
+    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
    }
  
    return BaseCost;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp

index c6ef63b2aa318533059c9928cb3cb055f370aa9a..063a5571c13b66c9276455959ec1a05388c8bfb5 100644 (file)
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -118,9 +118,10 @@ unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty,
    return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  }
  
-unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
-      ArrayRef<const Value*> Args, unsigned VF) {
-  return BaseT::getOperandsScalarizationOverhead(Args, VF);
+unsigned
+HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                                 ArrayRef<Type *> Tys) {
+  return BaseT::getOperandsScalarizationOverhead(Args, Tys);
  }
  
  unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h

index 10b31088e45111d61ebcbc43a2d02ed57305636b..61d50b9457a6f1a7cf2706510a4c92a5e45419e4 100644 (file)
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -107,7 +107,7 @@ public:
    unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
                                      bool Insert, bool Extract);
    unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF);
+                                            ArrayRef<Type *> Tys);
    unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys,
                              TTI::TargetCostKind CostKind);
    unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

index e7ac2391512f23b9c6e979bc75b448abed57ed38..535f164baf285f08c0641b3c21fbef979e2ba0f5 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -487,8 +487,10 @@ int SystemZTTIImpl::getArithmeticInstrCost(
  
      if (DivRemConstPow2)
        return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
-    if (DivRemConst)
-      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
+    if (DivRemConst) {
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+    }
      if ((SignedDivRem || UnsignedDivRem) && VF > 4)
        // Temporary hack: disable high vectorization factors with integer
        // division/remainder, which will get scalarized and handled with
@@ -511,7 +513,9 @@ int SystemZTTIImpl::getArithmeticInstrCost(
          // inserting and extracting the values.
          unsigned ScalarCost =
              getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
-        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
+        SmallVector<Type *> Tys(Args.size(), Ty);
+        unsigned Cost =
+            (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
          // FIXME: VF 2 for these FP operations are currently just as
          // expensive as for VF 4.
          if (VF == 2)
@@ -528,7 +532,9 @@ int SystemZTTIImpl::getArithmeticInstrCost(
  
      // There is no native support for FRem.
      if (Opcode == Instruction::FRem) {
-      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      unsigned Cost =
+          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
        // FIXME: VF 2 for float is currently just as expensive as for VF 4.
        if (VF == 2 && ScalarBits == 32)
          Cost *= 2;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 762302b779f7933fd954338968b19fa4c0d0059e..cebb8ece2827e7fe4caa3a6777b254d9db674a22 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3634,15 +3634,15 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
    return Cost;
  }
  
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+    return Elt;
+  return VectorType::get(Elt, VF);
+}
+
  InstructionCost
  LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                     ElementCount VF) {
-  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
-    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-      return Elt;
-    return VectorType::get(Elt, VF);
-  };
-
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    assert(ID && "Expected intrinsic call!");
    Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
@@ -6914,8 +6914,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
  
    // Skip operands that do not require extraction/scalarization and do not incur
    // any overhead.
+  SmallVector<Type *> Tys;
+  for (auto *V : filterExtractingOperands(Ops, VF))
+    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
    return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
+                    filterExtractingOperands(Ops, VF), Tys);
  }
  
  void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
diff --git a/llvm/test/Analysis/CostModel/PowerPC/matrix.ll b/llvm/test/Analysis/CostModel/PowerPC/matrix.ll

new file mode 100644 (file)

index 0000000..4f0416f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/PowerPC/matrix.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks we don't crash on certain matrix operations, more than
+; checks the cost of the intrinsics per-se.
+
+define void @matrix() {
+; CHECK-LABEL: 'matrix'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 452 for instruction: %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+  %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+  ret void
+}
+
+declare <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #2
+declare <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32>, <1 x i32>, i32 immarg, i32 immarg, i32 immarg) #3
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll

index 8302dd2e879d2c342f6a066583b713474bf6a55b..bb70cbcfd4e9683e805a66088ccb120f1086fe78 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -72,10 +72,11 @@ for.end:
  }
  
  define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
+;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled.
  ; CHECK-LABEL: @vec_intrinsic
  ; CHECK: vector.body:
  ; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %[[LOAD]])
  entry:
    %cmp7 = icmp sgt i64 %N, 0
    br i1 %cmp7, label %for.body, label %for.end
@@ -86,6 +87,7 @@ for.body:
    %0 = load double, double* %arrayidx, align 8
    %1 = call fast double @llvm.sin.f64(double %0) #2
    %add = fadd fast double %1, 1.000000e+00
+  store double %add, double* %arrayidx, align 8
    %iv.next = add nuw nsw i64 %iv, 1
    %exitcond = icmp eq i64 %iv.next, %N
    br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
author	David Green <david.green@arm.com>
	Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)
committer	David Green <david.green@arm.com>
	Tue, 23 Feb 2021 13:04:59 +0000 (13:04 +0000)
llvm/include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
llvm/include/llvm/CodeGen/BasicTTIImpl.h		patch \| blob \| history
llvm/lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Analysis/CostModel/PowerPC/matrix.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll		patch \| blob \| history