From acd9cc74957ba63967015946fc3349988ab200b1 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 15 Apr 2021 09:22:32 +0100
Subject: [PATCH] [AArch64] Use type-legalization cost for code size memop
 cost.

At the moment, getMemoryOpCost returns 1 for all inputs if CostKind is
CodeSize or SizeAndLatency. This fools LoopUnroll into thinking memory
operations on large vectors have a cost of one, even though they will be
expanded to a large number of memory operations in the backend.

This patch updates getMemoryOpCost to return the type-legalization cost
for both CodeSize and SizeAndLatency, which should more accurately
reflect the number of memory operations required. It is not entirely
clear from its description how latency should be included in
SizeAndLatency, but returning the size cost is clearly more accurate
than returning 1.

This causes no binary changes when building MultiSource/SPEC2000/SPEC2006
with -O3 -flto for AArch64, likely because Clang itself rarely emits
memory operations on very large vectors. The C/C++ matrix extension,
however, can easily produce such operations directly from Clang, e.g.
https://clang.godbolt.org/z/6xzxcTGvb

Reviewed By: samparker

Differential Revision: https://reviews.llvm.org/D100291
---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 11 ++--
 llvm/test/Analysis/CostModel/AArch64/store.ll      | 14 ++---
 .../LoopUnroll/AArch64/large-vector-ops.ll         | 68 ++++------------------
 3 files changed, 24 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cb516c3..754e4cf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -944,10 +944,6 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                 unsigned AddressSpace,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
-  // TODO: Handle other cost kinds.
-  if (CostKind != TTI::TCK_RecipThroughput)
-    return 1;
-
   // Type legalization can't handle structs
   if (TLI->getValueType(DL, Ty, true) == MVT::Other)
     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
@@ -955,6 +951,13 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
 
   auto LT = TLI->getTypeLegalizationCost(DL, Ty);
 
+  // TODO: consider latency as well for TCK_SizeAndLatency.
+  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+    return LT.first;
+
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return 1;
+
   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
     // Unaligned stores are extremely inefficient. We don't split all
diff --git a/llvm/test/Analysis/CostModel/AArch64/store.ll b/llvm/test/Analysis/CostModel/AArch64/store.ll
index 6374175..3f8e2a4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/store.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/store.ll
@@ -30,13 +30,13 @@ define void @getMemoryOpCost() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'getMemoryOpCost'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> undef, <4 x i64>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> undef, <8 x i32>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> undef, <16 x i16>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i8> undef, <32 x i8>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x double> undef, <4 x double>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x float> undef, <8 x float>* undef, align 4
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x half> undef, <16 x half>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, <4 x i64>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, <8 x i32>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i16> undef, <16 x i16>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> undef, <32 x i8>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x double> undef, <4 x double>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x float> undef, <8 x float>* undef, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x half> undef, <16 x half>* undef, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, <2 x i64>* undef, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, <4 x i32>* undef, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> undef, <8 x i16>* undef, align 4
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/large-vector-ops.ll b/llvm/test/Transforms/LoopUnroll/AArch64/large-vector-ops.ll
index 0ef3625..650567c 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/large-vector-ops.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/large-vector-ops.ll
@@ -6,70 +6,22 @@ target triple = "arm64-apple-ios5.0.0"
 
 ; The loop in the function only contains a few instructions, but they will get
 ; lowered to a very large amount of target instructions.
-; FIXME: Currently the cost-model assigns a cost of 1 to those large vector ops.
 define void @loop_with_large_vector_ops(i32 %i, <225 x double>* %A, <225 x double>* %B) {
 ; CHECK-LABEL: @loop_with_large_vector_ops(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[LV_1:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[LV_2:%.*]] = load <225 x double>, <225 x double>* [[A]], align 8
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr <225 x double>, <225 x double>* [[A:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[LV_1:%.*]] = load <225 x double>, <225 x double>* [[A_GEP]], align 8
+; CHECK-NEXT:    [[B_GEP:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[LV_2:%.*]] = load <225 x double>, <225 x double>* [[B_GEP]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul <225 x double> [[LV_1]], [[LV_2]]
-; CHECK-NEXT:    store <225 x double> [[MUL]], <225 x double>* [[A]], align 8
-; CHECK-NEXT:    [[A_GEP_1:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 1
-; CHECK-NEXT:    [[LV_1_1:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_1]], align 8
-; CHECK-NEXT:    [[B_GEP_1:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 1
-; CHECK-NEXT:    [[LV_2_1:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul <225 x double> [[LV_1_1]], [[LV_2_1]]
-; CHECK-NEXT:    store <225 x double> [[MUL_1]], <225 x double>* [[B_GEP_1]], align 8
-; CHECK-NEXT:    [[A_GEP_2:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 2
-; CHECK-NEXT:    [[LV_1_2:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_2]], align 8
-; CHECK-NEXT:    [[B_GEP_2:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 2
-; CHECK-NEXT:    [[LV_2_2:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_2]], align 8
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul <225 x double> [[LV_1_2]], [[LV_2_2]]
-; CHECK-NEXT:    store <225 x double> [[MUL_2]], <225 x double>* [[B_GEP_2]], align 8
-; CHECK-NEXT:    [[A_GEP_3:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 3
-; CHECK-NEXT:    [[LV_1_3:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_3]], align 8
-; CHECK-NEXT:    [[B_GEP_3:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 3
-; CHECK-NEXT:    [[LV_2_3:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_3]], align 8
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul <225 x double> [[LV_1_3]], [[LV_2_3]]
-; CHECK-NEXT:    store <225 x double> [[MUL_3]], <225 x double>* [[B_GEP_3]], align 8
-; CHECK-NEXT:    [[A_GEP_4:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 4
-; CHECK-NEXT:    [[LV_1_4:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_4]], align 8
-; CHECK-NEXT:    [[B_GEP_4:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 4
-; CHECK-NEXT:    [[LV_2_4:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_4]], align 8
-; CHECK-NEXT:    [[MUL_4:%.*]] = fmul <225 x double> [[LV_1_4]], [[LV_2_4]]
-; CHECK-NEXT:    store <225 x double> [[MUL_4]], <225 x double>* [[B_GEP_4]], align 8
-; CHECK-NEXT:    [[A_GEP_5:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 5
-; CHECK-NEXT:    [[LV_1_5:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_5]], align 8
-; CHECK-NEXT:    [[B_GEP_5:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 5
-; CHECK-NEXT:    [[LV_2_5:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_5]], align 8
-; CHECK-NEXT:    [[MUL_5:%.*]] = fmul <225 x double> [[LV_1_5]], [[LV_2_5]]
-; CHECK-NEXT:    store <225 x double> [[MUL_5]], <225 x double>* [[B_GEP_5]], align 8
-; CHECK-NEXT:    [[A_GEP_6:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 6
-; CHECK-NEXT:    [[LV_1_6:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_6]], align 8
-; CHECK-NEXT:    [[B_GEP_6:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 6
-; CHECK-NEXT:    [[LV_2_6:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_6]], align 8
-; CHECK-NEXT:    [[MUL_6:%.*]] = fmul <225 x double> [[LV_1_6]], [[LV_2_6]]
-; CHECK-NEXT:    store <225 x double> [[MUL_6]], <225 x double>* [[B_GEP_6]], align 8
-; CHECK-NEXT:    [[A_GEP_7:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 7
-; CHECK-NEXT:    [[LV_1_7:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_7]], align 8
-; CHECK-NEXT:    [[B_GEP_7:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 7
-; CHECK-NEXT:    [[LV_2_7:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_7]], align 8
-; CHECK-NEXT:    [[MUL_7:%.*]] = fmul <225 x double> [[LV_1_7]], [[LV_2_7]]
-; CHECK-NEXT:    store <225 x double> [[MUL_7]], <225 x double>* [[B_GEP_7]], align 8
-; CHECK-NEXT:    [[A_GEP_8:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 8
-; CHECK-NEXT:    [[LV_1_8:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_8]], align 8
-; CHECK-NEXT:    [[B_GEP_8:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 8
-; CHECK-NEXT:    [[LV_2_8:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_8]], align 8
-; CHECK-NEXT:    [[MUL_8:%.*]] = fmul <225 x double> [[LV_1_8]], [[LV_2_8]]
-; CHECK-NEXT:    store <225 x double> [[MUL_8]], <225 x double>* [[B_GEP_8]], align 8
-; CHECK-NEXT:    [[A_GEP_9:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 9
-; CHECK-NEXT:    [[LV_1_9:%.*]] = load <225 x double>, <225 x double>* [[A_GEP_9]], align 8
-; CHECK-NEXT:    [[B_GEP_9:%.*]] = getelementptr <225 x double>, <225 x double>* [[A]], i32 9
-; CHECK-NEXT:    [[LV_2_9:%.*]] = load <225 x double>, <225 x double>* [[B_GEP_9]], align 8
-; CHECK-NEXT:    [[MUL_9:%.*]] = fmul <225 x double> [[LV_1_9]], [[LV_2_9]]
-; CHECK-NEXT:    store <225 x double> [[MUL_9]], <225 x double>* [[B_GEP_9]], align 8
+; CHECK-NEXT:    store <225 x double> [[MUL]], <225 x double>* [[B_GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-- 
2.7.4
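
For reference, a minimal sketch of what the new size cost means in practice,
mirroring the store.ll change above: a <4 x i64> store is 256 bits wide and
NEON registers are 128 bits, so type legalization splits it into two stores
and LT.first is 2. The RUN lines are an assumption modelled on the legacy
cost-model printer used by such tests around this time, and @store_v4i64 is a
hypothetical name invented for this example.

; RUN: opt < %s -enable-new-pm=0 -cost-model -analyze \
; RUN:     -mtriple=arm64-apple-ios -cost-kind=code-size | FileCheck %s

; The 256-bit store is legalized into two 128-bit (q-register) stores, so the
; code-size cost kind now reports the type-legalization cost of 2 instead of
; a flat 1.
define void @store_v4i64(<4 x i64>* %p) {
; CHECK: Found an estimated cost of 2 for instruction: store <4 x i64>
  store <4 x i64> undef, <4 x i64>* %p, align 4
  ret void
}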