From 37f4ccb27545ca28a52a1a1c21cbccee03044d04 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto@arm.com>
Date: Fri, 6 Nov 2020 15:53:59 +0000
Subject: [PATCH] [AArch64]Add memory op cost model for SVE

This patch adds/fixes memory op cost model for SVE with fixed-width vector.

Differential Revision: https://reviews.llvm.org/D90950
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    | 11 ++-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h      |  5 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp       |  5 ++
 llvm/lib/Target/AArch64/AArch64Subtarget.h         |  1 +
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  |  6 +-
 .../Target/AArch64/AArch64TargetTransformInfo.h    |  1 +
 .../CostModel/AArch64/mem-op-cost-model.ll         | 88 ++++++++++++++++++++++
 .../AArch64/scalable-mem-op-cost-model.ll          | 51 +++++++++++++
 8 files changed, 157 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5d30b5f..15c67b4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,7 +269,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
   }
 
-  if (useSVEForFixedLengthVectors()) {
+  if (Subtarget->useSVEForFixedLengthVectors()) {
     for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
       if (useSVEForFixedLengthVectorVT(VT))
         addRegisterClass(VT, &AArch64::ZPRRegClass);
@@ -1085,7 +1085,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   // NOTE: Currently this has to happen after computeRegisterProperties rather
   // than the preferred option of combining it with the addRegisterClass call.
-  if (useSVEForFixedLengthVectors()) {
+  if (Subtarget->useSVEForFixedLengthVectors()) {
     for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
       if (useSVEForFixedLengthVectorVT(VT))
         addTypeForFixedLengthSVE(VT);
@@ -4140,14 +4140,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   }
 }
 
-bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
-  // Prefer NEON unless larger SVE registers are available.
-  return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+  return !Subtarget->useSVEForFixedLengthVectors();
 }
 
 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
     EVT VT, bool OverrideNEON) const {
-  if (!useSVEForFixedLengthVectors())
+  if (!Subtarget->useSVEForFixedLengthVectors())
     return false;
 
   if (!VT.isFixedLengthVector())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfc83a9..47248b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -773,9 +773,7 @@ public:
   /// illegal as the original, thus leading to an infinite legalisation loop.
   /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
   /// vector types this override can be removed.
-  bool mergeStoresAfterLegalization(EVT VT) const override {
-    return !useSVEForFixedLengthVectors();
-  }
+  bool mergeStoresAfterLegalization(EVT VT) const override;
 
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
@@ -1008,7 +1006,6 @@ private:
   bool shouldLocalize(const MachineInstr &MI,
                       const TargetTransformInfo *TTI) const override;
 
-  bool useSVEForFixedLengthVectors() const;
   // Normally SVE is only used for byte size vectors that do not fit within a
   // NEON vector. This changes when OverrideNEON is true, allowing SVE to be
   // used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index fdf979b..b4d71ac 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -368,3 +368,8 @@ unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
     return (SVEVectorBitsMin / 128) * 128;
   return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
 }
+
+bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
+  // Prefer NEON unless larger SVE registers are available.
+  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 67c682c..4eb4843 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -555,6 +555,7 @@ public:
   // implied by the architecture.
   unsigned getMaxSVEVectorSizeInBits() const;
   unsigned getMinSVEVectorSizeInBits() const;
+  bool useSVEForFixedLengthVectors() const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 595f403..4f7ebff 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -751,6 +751,10 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   return Options;
 }
 
+bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
+  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
+}
+
 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                     MaybeAlign Alignment, unsigned AddressSpace,
                                     TTI::TargetCostKind CostKind,
@@ -778,7 +782,7 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() &&
+  if (useNeonVector(Ty) &&
       cast<FixedVectorType>(Ty)->getElementType()->isIntegerTy(8)) {
     unsigned ProfitableNumElements;
     if (Opcode == Instruction::Store)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a624f8b..baf11cd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -147,6 +147,7 @@ public:
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
 
+  bool useNeonVector(const Type *Ty) const;
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace,
diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
new file mode 100644
index 0000000..3a4e0f0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll
@@ -0,0 +1,88 @@
+; Check memory cost model action for fixed vector SVE and Neon
+; Vector bits size lower than 256 bits end up assuming Neon cost model
+; CHECK-NEON has same performance as CHECK-SVE-128
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s --check-prefix=CHECK-SVE-128
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s --check-prefix=CHECK-SVE-256
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s --check-prefix=CHECK-SVE-512
+
+define <16 x i8> @load16(<16 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  %out = load <16 x i8>, <16 x i8>* %ptr
+  ret <16 x i8> %out
+}
+
+define void @store16(<16 x i8>* %ptr, <16 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store16':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  store <16 x i8> %val, <16 x i8>* %ptr
+  ret void
+}
+
+define <8 x i8> @load8(<8 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  %out = load <8 x i8>, <8 x i8>* %ptr
+  ret <8 x i8> %out
+}
+
+define void @store8(<8 x i8>* %ptr, <8 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store8':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  store <8 x i8> %val, <8 x i8>* %ptr
+  ret void
+}
+
+define <4 x i8> @load4(<4 x i8>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 64 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  %out = load <4 x i8>, <4 x i8>* %ptr
+  ret <4 x i8> %out
+}
+
+define void @store4(<4 x i8>* %ptr, <4 x i8> %val) {
+; CHECK: 'Cost Model Analysis' for function 'store4':
+; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  store <4 x i8> %val, <4 x i8>* %ptr
+  ret void
+}
+
+define <16 x i16> @load_256(<16 x i16>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_256':
+; CHECK-NEON: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  %out = load <16 x i16>, <16 x i16>* %ptr
+  ret <16 x i16> %out
+}
+
+define <8 x i64> @load_512(<8 x i64>* %ptr) {
+; CHECK: 'Cost Model Analysis' for function 'load_512':
+; CHECK-NEON: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 4 for instruction:
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
+  %out = load <8 x i64>, <8 x i64>* %ptr
+  ret <8 x i64> %out
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
new file mode 100644
index 0000000..1a7b262
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/scalable-mem-op-cost-model.ll
@@ -0,0 +1,51 @@
+; Checks if the memory cost model does not break when using scalable vectors
+
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 8 x i8> @load-sve-8(<vscale x 8 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-8':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  %retval = load <vscale x 8 x i8>, <vscale x 8 x i8>* %ptr
+  ret <vscale x 8 x i8> %retval
+}
+
+define void @store-sve-8(<vscale x 8 x i8>* %ptr, <vscale x 8 x i8> %val) {
+; CHECK-LABEL: 'store-sve-8'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  store <vscale x 8 x i8> %val, <vscale x 8 x i8>* %ptr
+  ret void
+}
+
+define <vscale x 16 x i8> @load-sve-16(<vscale x 16 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-16':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  %retval = load <vscale x 16 x i8>, <vscale x 16 x i8>* %ptr
+  ret <vscale x 16 x i8> %retval
+}
+
+define void @store-sve-16(<vscale x 16 x i8>* %ptr, <vscale x 16 x i8> %val) {
+; CHECK-LABEL: 'store-sve-16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  store <vscale x 16 x i8> %val, <vscale x 16 x i8>* %ptr
+  ret void
+}
+
+define <vscale x 32 x i8> @load-sve-32(<vscale x 32 x i8>* %ptr) {
+; CHECK-LABEL: 'load-sve-32':
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  %retval = load <vscale x 32 x i8>, <vscale x 32 x i8>* %ptr
+  ret <vscale x 32 x i8> %retval
+}
+
+define void @store-sve-32(<vscale x 32 x i8>* %ptr, <vscale x 32 x i8> %val) {
+; CHECK-LABEL: 'store-sve-32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction:
+  store <vscale x 32 x i8> %val, <vscale x 32 x i8>* %ptr
+  ret void
+}
-- 
2.7.4