From 2aba753e843271a0cae9702a70793c05f64e04de Mon Sep 17 00:00:00 2001
From: Balaram Makam
Date: Fri, 31 Mar 2017 18:16:53 +0000
Subject: [PATCH] [AArch64] Add new subtarget feature to fold LSL into address mode.

Summary: This feature enables folding of logical shift operations of up to 3
places into the addressing mode on Kryo and Falkor, which have a fastpath LSL.

Reviewers: mcrosier, rengolin, t.p.northover

Subscribers: junbuml, gberry, llvm-commits, aemerson

Differential Revision: https://reviews.llvm.org/D31113

llvm-svn: 299240
---
 llvm/lib/Target/AArch64/AArch64.td                |  9 ++-
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp   | 47 +++++++++++++-
 llvm/lib/Target/AArch64/AArch64Subtarget.h        |  2 +
 llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 74 +++++++++++++++++++++++
 4 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 0989710..84473fd 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -126,6 +126,9 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
                                         "equivalent when the immediate does "
                                         "not fit in the encoding.">;
 
+def FeatureLSLFast : SubtargetFeature<
+    "lsl-fast", "HasLSLFast", "true",
+    "CPU has a fastpath logical shift of up to 3 places">;
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -279,7 +282,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
 def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -293,7 +297,8 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureRDM,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
 def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383..ae01ea4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
   }
 }
 
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+  // It is worth folding logical shift of up to three places.
+  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (!CSD)
+    return false;
+  unsigned ShiftVal = CSD->getZExtValue();
+  if (ShiftVal > 3)
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation. If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = V.getNode();
+  for (SDNode *UI : Node->uses())
+    if (!isa<MemSDNode>(*UI))
+      for (SDNode *UII : UI->uses())
+        if (!isa<MemSDNode>(*UII))
+          return false;
+  return true;
+}
+
 /// \brief Determine whether it is worth to fold V into an extended register.
 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the value is used at least twice, unless we are optimizing
-  // for code size.
-  return ForCodeSize || V.hasOneUse();
+  // Trivial if we are optimizing for code size or if there is only
+  // one use of the value.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
+  // If a subtarget has a fastpath LSL we can fold a logical shift into
+  // the addressing mode and save a cycle.
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+      isWorthFoldingSHL(V))
+    return true;
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+    const SDValue LHS = V.getOperand(0);
+    const SDValue RHS = V.getOperand(1);
+    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+      return true;
+    if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+      return true;
+  }
+
+  // It hurts otherwise, since the value will be reused.
+  return false;
 }
 
 /// SelectShiftedRegister - Select a "shifted register" operand. If the value
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 10377cb..0ef8949b 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -69,6 +69,7 @@ protected:
   bool HasPerfMon = false;
   bool HasFullFP16 = false;
   bool HasSPE = false;
+  bool HasLSLFast = false;
 
   // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
   bool HasZeroCycleRegMove = false;
@@ -232,6 +233,7 @@ public:
   bool hasPerfMon() const { return HasPerfMon; }
   bool hasFullFP16() const { return HasFullFP16; }
   bool hasSPE() const { return HasSPE; }
+  bool hasLSLFast() const { return HasLSLFast; }
 
   bool isLittleEndian() const { return IsLittle; }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
new file mode 100644
index 0000000..0dfe04b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s
+
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+declare void @foo()
+define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: halfword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
+; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.a, %struct.a* %ctx, i64 0, i64 %idxprom83
+  %result = load i16, i16* %arrayidx86, align 2
+  call void @foo()
+  store i16 %result, i16* %arrayidx86, align 2
+  ret i16 %result
+}
+
+define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: word:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.b, %struct.b* %ctx, i64 0, i64 %idxprom83
+  %result = load i32, i32* %arrayidx86, align 4
+  call void @foo()
+  store i32 %result, i32* %arrayidx86, align 4
+  ret i32 %result
+}
+
+define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: doubleword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.c, %struct.c* %ctx, i64 0, i64 %idxprom83
+  %result = load i64, i64* %arrayidx86, align 8
+  call void @foo()
+  store i64 %result, i64* %arrayidx86, align 8
+  ret i64 %result
+}
+
+define i64 @multi_use_non_memory(i64 %a, i64 %b) {
+; CHECK-LABEL: multi_use_non_memory:
+; CHECK: lsl [[REG1:x[0-9]+]], x0, #3
+; CHECK-NOT: cmp [[REG1]], x1, lsl # 3
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], x1, #3
+; CHECK-NEXT: cmp [[REG1]], [[REG2]]
+entry:
+  %mul1 = shl i64 %a, 3
+  %mul2 = shl i64 %b, 3
+  %cmp = icmp slt i64 %mul1, %mul2
+  br i1 %cmp, label %truebb, label %falsebb
+truebb:
+  tail call void @foo()
+  unreachable
+falsebb:
+  %cmp2 = icmp sgt i64 %mul1, %mul2
+  br i1 %cmp2, label %exitbb, label %endbb
+exitbb:
+  ret i64 %mul1
+endbb:
+  ret i64 %mul2
+}
-- 
2.7.4
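Usage note (not part of the patch): the fold is gated on the new "lsl-fast" subtarget
feature, so it can be exercised explicitly with
"llc -mtriple=aarch64-linux-gnu -mattr=+lsl-fast" (as in the test above), or implicitly
through "-mcpu=kryo" / "-mcpu=falkor", since both processor definitions now pull in
FeatureLSLFast. Any other AArch64 CPU model can opt in the same way; the TableGen sketch
below uses a hypothetical ProcExampleCPU record with an abbreviated feature list purely
for illustration.

// Hypothetical processor definition (not part of this change): listing
// FeatureLSLFast is all that is needed for isWorthFolding() to consider
// folding shifts of up to 3 places into the addressing mode on this CPU.
// The "ExampleCPU" ARMProcFamily value and the feature list are illustrative.
def ProcExampleCPU : SubtargetFeature<"examplecpu", "ARMProcFamily",
                                      "ExampleCPU",
                                      "Example CPU with a fastpath LSL", [
                                      FeatureFPARMv8,
                                      FeatureNEON,
                                      FeaturePerfMon,
                                      FeatureLSLFast
                                      ]>;

Keeping the feature opt-in preserves the previous conservative behavior (only fold
single-use values unless optimizing for size) on cores where a shifted-register
address is more expensive than a separate LSL.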