[SystemZ] Take better care when computing needed vector registers in TTI.

author Jonas Paulsson <paulsson@linux.vnet.ibm.com>

Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)

committer Jonas Paulsson <paulsson@linux.vnet.ibm.com>

Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)
author Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)
committer Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

index 9b518f8..6efd7f6 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -328,6 +328,25 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
    return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
  }
  
+// Return the bit size for the scalar type or vector element type. Pointers are
+// taken to be 64 bits, since Type::getScalarSizeInBits() returns 0 for them.
+static unsigned getScalarSizeInBits(Type *Ty) {
+  unsigned Size =
+    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
+  assert(Size > 0 && "Element must have non-zero size.");
+  return Size;
+}
+
+// Return the number of 128-bit vector registers needed to hold Ty, rounded up.
+// getNumberOfParts() is not used: it calls getTypeLegalizationCost(), which
+// splits the vector type until it is legal, giving e.g. 4 for <6 x i64>, not 3.
+static unsigned getNumVectorRegs(Type *Ty) {
+  assert(Ty->isVectorTy() && "Expected vector type");
+  unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
+  assert(WideBits > 0 && "Could not compute size of vector");
+  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+}
+
  int SystemZTTIImpl::getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
@@ -370,7 +389,7 @@ int SystemZTTIImpl::getArithmeticInstrCost(
    if (Ty->isVectorTy()) {
      assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
      unsigned VF = Ty->getVectorNumElements();
-    unsigned NumVectors = getNumberOfParts(Ty);
+    unsigned NumVectors = getNumVectorRegs(Ty);
  
      // These vector operations are custom handled, but are still supported
      // with one instruction per vector, regardless of element size.
@@ -465,12 +484,11 @@ int SystemZTTIImpl::getArithmeticInstrCost(
                                         Opd1PropInfo, Opd2PropInfo, Args);
  }
  
-
  int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                     Type *SubTp) {
    assert (Tp->isVectorTy());
    assert (ST->hasVector() && "getShuffleCost() called.");
-  unsigned NumVectors = getNumberOfParts(Tp);
+  unsigned NumVectors = getNumVectorRegs(Tp);
  
    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
  
@@ -525,7 +543,7 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  
    // TODO: Since fp32 is expanded, the extract cost should always be 0.
  
-  unsigned NumParts = getNumberOfParts(SrcTy);
+  unsigned NumParts = getNumVectorRegs(SrcTy);
    if (NumParts <= 2)
      // Up to 2 vector registers can be truncated efficiently with pack or
      // permute. The latter requires an immediate mask to be loaded, which
@@ -568,7 +586,7 @@ getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
      // The bitmask will be truncated.
      PackCost = getVectorTruncCost(SrcTy, DstTy);
    else if (SrcScalarBits < DstScalarBits) {
-    unsigned DstNumParts = getNumberOfParts(DstTy);
+    unsigned DstNumParts = getNumVectorRegs(DstTy);
      // Each vector select needs its part of the bitmask unpacked.
      PackCost = Log2Diff * DstNumParts;
      // Extra cost for moving part of mask before unpacking.
@@ -613,8 +631,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
      assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
      assert (Dst->isVectorTy());
      unsigned VF = Src->getVectorNumElements();
-    unsigned NumDstVectors = getNumberOfParts(Dst);
-    unsigned NumSrcVectors = getNumberOfParts(Src);
+    unsigned NumDstVectors = getNumVectorRegs(Dst);
+    unsigned NumSrcVectors = getNumVectorRegs(Src);
  
      if (Opcode == Instruction::Trunc) {
        if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
@@ -763,7 +781,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
        // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
        // floats.  FIXME: <2 x float> generates same code as <4 x float>.
        unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
-      unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
  
        unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
        return Cost;
@@ -779,7 +797,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
          PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);
  
-      return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
      }
    }
    else { // Scalar
@@ -808,7 +826,7 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
      return ((Index % 2 == 0) ? 1 : 0);
  
    if (Opcode == Instruction::ExtractElement) {
-    int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
  
      // Give a slight penalty for moving out of vector pipeline to FXU unit.
      if (Index == 0 && Val->isIntOrIntVectorTy())
@@ -828,7 +846,7 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
    if (!Src->isVectorTy() && Opcode == Instruction::Load &&
        I != nullptr && I->hasOneUse()) {
        const Instruction *UserI = cast<Instruction>(*I->user_begin());
-      unsigned Bits = Src->getScalarSizeInBits();
+      unsigned Bits = getScalarSizeInBits(Src);
        bool FoldsLoad = false;
        switch (UserI->getOpcode()) {
        case Instruction::ICmp:
@@ -870,7 +888,8 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
        }
    }
  
-  unsigned NumOps = getNumberOfParts(Src);
+  unsigned NumOps =
+    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
  
    if (Src->getScalarSizeInBits() == 128)
      // 128 bit scalars are held in a pair of two 64 bit registers.
@@ -887,11 +906,7 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
    assert(isa<VectorType>(VecTy) &&
           "Expect a vector type for interleaved memory op");
  
-  unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
-     (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
-  assert (WideBits > 0 && "Could not compute size of vector");
-  int NumWideParts =
-    ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+  int NumWideParts = getNumVectorRegs(VecTy);
  
    // How many source vectors are handled to produce a vectorized operand?
    int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/load-ptr-cmp-fold.ll b/llvm/test/Analysis/CostModel/SystemZ/load-ptr-cmp-fold.ll

new file mode 100644 (file)

index 0000000..6822797
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/load-ptr-cmp-fold.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Test that the cost heuristic for a folded load also works for a pointer operand.
+define void @fun0(i64* %lhs, i64** %rhs_ptr) {
+  %rhs = load i64*, i64** %rhs_ptr
+  %c = icmp eq i64* %lhs, %rhs
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %rhs = load
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %c = icmp
+}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/numvectorregs.ll b/llvm/test/Analysis/CostModel/SystemZ/numvectorregs.ll

new file mode 100644 (file)

index 0000000..35b58e3
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/numvectorregs.ll
@@ -0,0 +1,10 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Test that the cost reflects the number of vector registers actually needed
+; for a non-power-of-two vector type.
+define <6 x double> @fun0(<6 x double> %lhs, <6 x double> %rhs) {
+  %a = fadd <6 x double> %lhs, %rhs
+  ret <6 x double> %a
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'fun0':
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %a = fadd <6 x double>
+}
author	Jonas Paulsson <paulsson@linux.vnet.ibm.com>
	Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)
committer	Jonas Paulsson <paulsson@linux.vnet.ibm.com>
	Wed, 10 Oct 2018 07:36:27 +0000 (07:36 +0000)
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp		patch \| blob \| history
llvm/test/Analysis/CostModel/SystemZ/load-ptr-cmp-fold.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Analysis/CostModel/SystemZ/numvectorregs.ll	[new file with mode: 0644]	patch \| blob