From 2aba753e843271a0cae9702a70793c05f64e04de Mon Sep 17 00:00:00 2001
From: Balaram Makam
Date: Fri, 31 Mar 2017 18:16:53 +0000
Subject: [PATCH] [AArch64] Add new subtarget feature to fold LSL into address mode.

Summary: This feature enables folding of logical shift operations of up to 3
places into the addressing mode on Kryo and Falkor, which have a fastpath LSL.

Reviewers: mcrosier, rengolin, t.p.northover

Subscribers: junbuml, gberry, llvm-commits, aemerson

Differential Revision: https://reviews.llvm.org/D31113

llvm-svn: 299240
---
 llvm/lib/Target/AArch64/AArch64.td                |  9 ++-
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp   | 47 +++++++++++++-
 llvm/lib/Target/AArch64/AArch64Subtarget.h        |  2 +
 llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 74 +++++++++++++++++++++++
 4 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 0989710..84473fd 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -126,6 +126,9 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
                                         "equivalent when the immediate does "
                                         "not fit in the encoding.">;
 
+def FeatureLSLFast : SubtargetFeature<
+    "lsl-fast", "HasLSLFast", "true",
+    "CPU has a fastpath logical shift of up to 3 places">;
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -279,7 +282,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
 def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -293,7 +297,8 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureRDM,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
 def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383..ae01ea4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
   }
 }
 
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+  // It is worth folding logical shift of up to three places.
+  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (!CSD)
+    return false;
+  unsigned ShiftVal = CSD->getZExtValue();
+  if (ShiftVal > 3)
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation. If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = V.getNode();
+  for (SDNode *UI : Node->uses())
+    if (!isa<MemSDNode>(*UI))
+      for (SDNode *UII : UI->uses())
+        if (!isa<MemSDNode>(*UII))
+          return false;
+  return true;
+}
+
 /// \brief Determine whether it is worth to fold V into an extended register.
 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the value is used at least twice, unless we are optimizing
-  // for code size.
-  return ForCodeSize || V.hasOneUse();
+  // Trivial if we are optimizing for code size or if there is only
+  // one use of the value.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
+  // If a subtarget has a fastpath LSL we can fold a logical shift into
+  // the addressing mode and save a cycle.
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+      isWorthFoldingSHL(V))
+    return true;
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+    const SDValue LHS = V.getOperand(0);
+    const SDValue RHS = V.getOperand(1);
+    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+      return true;
+    if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+      return true;
+  }
+
+  // It hurts otherwise, since the value will be reused.
+  return false;
 }
 
 /// SelectShiftedRegister - Select a "shifted register" operand. If the value
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 10377cb..0ef8949b 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -69,6 +69,7 @@ protected:
   bool HasPerfMon = false;
   bool HasFullFP16 = false;
   bool HasSPE = false;
+  bool HasLSLFast = false;
 
   // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
   bool HasZeroCycleRegMove = false;
@@ -232,6 +233,7 @@ public:
   bool hasPerfMon() const { return HasPerfMon; }
   bool hasFullFP16() const { return HasFullFP16; }
   bool hasSPE() const { return HasSPE; }
+  bool hasLSLFast() const { return HasLSLFast; }
 
   bool isLittleEndian() const { return IsLittle; }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
new file mode 100644
index 0000000..0dfe04b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s
+
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+declare void @foo()
+define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: halfword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
+; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.a, %struct.a* %ctx, i64 0, i64 %idxprom83
+  %result = load i16, i16* %arrayidx86, align 2
+  call void @foo()
+  store i16 %result, i16* %arrayidx86, align 2
+  ret i16 %result
+}
+
+define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: word:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.b, %struct.b* %ctx, i64 0, i64 %idxprom83
+  %result = load i32, i32* %arrayidx86, align 4
+  call void @foo()
+  store i32 %result, i32* %arrayidx86, align 4
+  ret i32 %result
+}
+
+define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: doubleword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.c, %struct.c* %ctx, i64 0, i64 %idxprom83
+  %result = load i64, i64* %arrayidx86, align 8
+  call void @foo()
+  store i64 %result, i64* %arrayidx86, align 8
+  ret i64 %result
+}
+
+define i64 @multi_use_non_memory(i64 %a, i64 %b) {
+; CHECK-LABEL: multi_use_non_memory:
+; CHECK: lsl [[REG1:x[0-9]+]], x0, #3
+; CHECK-NOT: cmp [[REG1]], x1, lsl # 3
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], x1, #3
+; CHECK-NEXT: cmp [[REG1]], [[REG2]]
+entry:
+  %mul1 = shl i64 %a, 3
+  %mul2 = shl i64 %b, 3
+  %cmp = icmp slt i64 %mul1, %mul2
+  br i1 %cmp, label %truebb, label %falsebb
+truebb:
+  tail call void @foo()
+  unreachable
+falsebb:
+  %cmp2 = icmp sgt i64 %mul1, %mul2
+  br i1 %cmp2, label %exitbb, label %endbb
+exitbb:
+  ret i64 %mul1
+endbb:
+  ret i64 %mul2
+}
-- 
2.7.4
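Usage note (not part of the patch): the fold is gated on the new "lsl-fast" subtarget
feature, so it can be exercised explicitly with
"llc -mtriple=aarch64-linux-gnu -mattr=+lsl-fast" (as in the test above), or implicitly
through "-mcpu=kryo" / "-mcpu=falkor", since both processor definitions now pull in
FeatureLSLFast. Any other AArch64 CPU model can opt in the same way; the TableGen sketch
below uses a hypothetical ProcExampleCPU record with an abbreviated feature list purely
for illustration.

// Hypothetical processor definition (not part of this change): listing
// FeatureLSLFast is all that is needed for isWorthFolding() to consider
// folding shifts of up to 3 places into the addressing mode on this CPU.
// The "ExampleCPU" ARMProcFamily value and the feature list are illustrative.
def ProcExampleCPU : SubtargetFeature<"examplecpu", "ARMProcFamily",
                                      "ExampleCPU",
                                      "Example CPU with a fastpath LSL", [
                                      FeatureFPARMv8,
                                      FeatureNEON,
                                      FeaturePerfMon,
                                      FeatureLSLFast
                                      ]>;

Keeping the feature opt-in preserves the previous conservative behavior (only fold
single-use values unless optimizing for size) on cores where a shifted-register
address is more expensive than a separate LSL.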