From f008b5b8ce724d60f0f0eeafceee0119c42022d4 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 8 Dec 2019 15:26:32 +0000
Subject: [PATCH] [ARM] Additional tests and minor formatting. NFC

This adds some extra cost model tests for shifts, and moves the Neon
division cost table inside the hasNEON() check to make it clear what it
applies to. Both changes are NFC.
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp     | 86 +++++++++----------
 llvm/test/Analysis/CostModel/ARM/freeshift.ll      | 96 ++++++++++++++++++++++
 .../Transforms/LoopVectorize/ARM/mve-shiftcost.ll  | 86 +++++++++++++++++++
 3 files changed, 225 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/ARM/freeshift.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 10faeb7..f7443c8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -650,50 +650,50 @@ int ARMTTIImpl::getArithmeticInstrCost(
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
-  const unsigned FunctionCallDivCost = 20;
-  const unsigned ReciprocalDivCost = 10;
-  static const CostTblEntry CostTbl[] = {
-    // Division.
-    // These costs are somewhat random. Choose a cost of 20 to indicate that
-    // vectorizing devision (added function call) is going to be very expensive.
-    // Double registers types.
-    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
-    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
-    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
-    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
-    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
-    // Quad register types.
-    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    // Multiplication.
-  };
-
   if (ST->hasNEON()) {
+    const unsigned FunctionCallDivCost = 20;
+    const unsigned ReciprocalDivCost = 10;
+    static const CostTblEntry CostTbl[] = {
+      // Division.
+      // These costs are somewhat random. Choose a cost of 20 to indicate that
+      // vectorizing division (added function call) is going to be very expensive.
+      // Double registers types.
+      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
+      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
+      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
+      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
+      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
+      // Quad register types.
+      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      // Multiplication.
+    };
+
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
 
diff --git a/llvm/test/Analysis/CostModel/ARM/freeshift.ll b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
new file mode 100644
index 0000000..8e48a7f7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
+
+define void @shl(i32 %a, i32 %b) {
+; CHECK-LABEL: 'shl'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; Each 'shl %a, 3' feeds exactly one user (add, sub, xor, and, or, icmp eq) with %b; every instruction is expected to cost 1 and the ret 0. Presumably a baseline for shifts that could fold into their user - confirm.
+  %as = shl i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = shl i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = shl i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = shl i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = shl i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = shl i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
+define void @ashr(i32 %a, i32 %b) {
+; CHECK-LABEL: 'ashr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; Each 'ashr %a, 3' feeds exactly one user (add, sub, xor, and, or, icmp eq) with %b; every instruction is expected to cost 1 and the ret 0. Presumably a baseline for shifts that could fold into their user - confirm.
+  %as = ashr i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = ashr i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = ashr i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = ashr i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = ashr i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = ashr i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
+define void @lshr(i32 %a, i32 %b) {
+; CHECK-LABEL: 'lshr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; Each 'lshr %a, 3' feeds exactly one user (add, sub, xor, and, or, icmp eq) with %b; every instruction is expected to cost 1 and the ret 0. Presumably a baseline for shifts that could fold into their user - confirm.
+  %as = lshr i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = lshr i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = lshr i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = lshr i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = lshr i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = lshr i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
new file mode 100644
index 0000000..858d3d8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -enable-arm-maskedldst < %s -S -o - | FileCheck %s --check-prefix=CHECK
+; RUN: opt -loop-vectorize -enable-arm-maskedldst -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; CHECK-LABEL: test
+; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %and515 = shl i32 %l41, 3
+; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK: vector.body
+
+define void @test([101 x i32] *%src, i32 %N) #0 {  ; loop over elements of %src, index 0 up to %N, with a conditional shl/and/or chain as its body
+entry:
+  br label %for.body386
+  
+for.body386:                                      ; preds = %entry, %l77
+  %add387 = phi i32 [ %inc532, %l77 ], [ 0, %entry ]
+  %arrayidx388 = getelementptr inbounds [101 x i32], [101 x i32]* %src, i32 0, i32 %add387
+  %l41 = load i32, i32* %arrayidx388, align 4
+  %l42 = and i32 %l41, 65535
+  %l43 = icmp eq i32 %l42, 0
+  br i1 %l43, label %l77, label %l44  ; skip the update when the low 16 bits of the loaded element are zero
+
+l44:                                               ; preds = %for.body386
+  %and515 = shl i32 %l41, 3  ; this shl and the and below are the pair whose VF 1 and VF 4 costs are verified by the expectations above the function
+  %l45 = and i32 %and515, 131072
+  %and506 = shl i32 %l41, 5
+  %l46 = and i32 %and506, 262144
+  %and497 = shl i32 %l41, 7
+  %l47 = and i32 %and497, 524288
+  %and488 = shl i32 %l41, 9
+  %l48 = and i32 %and488, 1048576
+  %and479 = shl i32 %l41, 11
+  %l49 = and i32 %and479, 2097152
+  %and470 = shl i32 %l41, 13
+  %l50 = and i32 %and470, 4194304
+  %and461 = shl i32 %l41, 15
+  %l51 = and i32 %and461, 8388608
+  %and452 = shl i32 %l41, 17
+  %l52 = and i32 %and452, 16777216
+  %and443 = shl i32 %l41, 19
+  %l53 = and i32 %and443, 33554432
+  %and434 = shl i32 %l41, 21
+  %l54 = and i32 %and434, 67108864
+  %and425 = shl i32 %l41, 23
+  %l55 = and i32 %and425, 134217728
+  %and416 = shl i32 %l41, 25
+  %l56 = and i32 %and416, 268435456
+  %and407 = shl i32 %l41, 27
+  %l57 = and i32 %and407, 536870912
+  %and398 = shl i32 %l41, 29
+  %l58 = and i32 %and398, 1073741824
+  %l59 = shl i32 %l41, 31
+  %l60 = or i32 %l59, %l41
+  %l61 = or i32 %l58, %l60
+  %l62 = or i32 %l57, %l61
+  %l63 = or i32 %l56, %l62
+  %l64 = or i32 %l55, %l63
+  %l65 = or i32 %l54, %l64
+  %l66 = or i32 %l53, %l65
+  %l67 = or i32 %l52, %l66
+  %l68 = or i32 %l51, %l67
+  %l69 = or i32 %l50, %l68
+  %l70 = or i32 %l49, %l69
+  %l71 = or i32 %l48, %l70
+  %l72 = or i32 %l47, %l71
+  %l73 = or i32 %l46, %l72
+  %l74 = or i32 %l45, %l73
+  %and524 = shl i32 %l41, 1
+  %l75 = and i32 %and524, 65536
+  %l76 = or i32 %l75, %l74
+  store i32 %l76, i32* %arrayidx388, align 4  ; write the combined value back to the element that was loaded
+  br label %l77
+
+l77:                                               ; preds = %for.body386, %l44
+  %inc532 = add nuw nsw i32 %add387, 1
+  %exitcond649 = icmp eq i32 %inc532, %N
+  br i1 %exitcond649, label %exit, label %for.body386
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" }
-- 
2.7.4