From 42c83f131eff4046ba166b88959ae29ebce0c925 Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Wed, 7 Sep 2016 07:36:11 +0000 Subject: [PATCH] [PowerPC] Fix address-offset folding for plain addi When folding an addi into a memory access that can take an immediate offset, we were implicitly assuming that the existing offset was zero. This was incorrect. If we're dealing with an addi with a plain constant, we can add it to the existing offset (assuming that doesn't overflow the immediate, etc.), but if we have anything else (i.e. something that will become a relocation expression), we'll go back to requiring the existing immediate offset to be zero (because we don't know what the requirements on that relocation expression might be - e.g. maybe it is paired with some addis in some relevant way). On the other hand, when dealing with a plain addi with a regular constant immediate, the alignment restrictions (from the TOC base pointer, etc.) are irrelevant. I've added the test case from PR30280, which demonstrated the bug, but also demonstrates a missed optimization opportunity (i.e. we don't need the memory accesses at all). Fixes PR30280. llvm-svn: 280789 --- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 53 +++++++++++++++++++-------- llvm/test/CodeGen/PowerPC/addi-offset-fold.ll | 40 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/addi-offset-fold.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 56e3922..91eb051 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4385,25 +4385,48 @@ void PPCDAGToDAGISel::PeepholePPC64() { SDValue HBase = Base.getOperand(0); int Offset = N->getConstantOperandVal(FirstOp); - if (Offset < 0 || Offset > MaxDisplacement) { - // If we have a addi(toc@l)/addis(toc@ha) pair, and the addis has only - // one use, then we can do this for any offset, we just need to also - // update the offset (i.e. the symbol addend) on the addis also. - if (Base.getMachineOpcode() != PPC::ADDItocL) - continue; + if (ReplaceFlags) { + if (Offset < 0 || Offset > MaxDisplacement) { + // If we have a addi(toc@l)/addis(toc@ha) pair, and the addis has only + // one use, then we can do this for any offset, we just need to also + // update the offset (i.e. the symbol addend) on the addis also. + if (Base.getMachineOpcode() != PPC::ADDItocL) + continue; - if (!HBase.isMachineOpcode() || - HBase.getMachineOpcode() != PPC::ADDIStocHA) - continue; + if (!HBase.isMachineOpcode() || + HBase.getMachineOpcode() != PPC::ADDIStocHA) + continue; - if (!Base.hasOneUse() || !HBase.hasOneUse()) - continue; + if (!Base.hasOneUse() || !HBase.hasOneUse()) + continue; - SDValue HImmOpnd = HBase.getOperand(1); - if (HImmOpnd != ImmOpnd) - continue; + SDValue HImmOpnd = HBase.getOperand(1); + if (HImmOpnd != ImmOpnd) + continue; - UpdateHBase = true; + UpdateHBase = true; + } + } else { + // If we're directly folding the addend from an addi instruction, then: + // 1. In general, the offset on the memory access must be zero. + // 2. If the addend is a constant, then it can be combined with a + // non-zero offset, but only if the result meets the encoding + // requirements. + if (auto *C = dyn_cast(ImmOpnd)) { + Offset += C->getSExtValue(); + + if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD || + StorageOpcode == PPC::STD) && (Offset % 4) != 0) + continue; + + if (!isInt<16>(Offset)) + continue; + + ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), + ImmOpnd.getValueType()); + } else if (Offset != 0) { + continue; + } } // We found an opportunity. Reverse the operands from the add diff --git a/llvm/test/CodeGen/PowerPC/addi-offset-fold.ll b/llvm/test/CodeGen/PowerPC/addi-offset-fold.ll new file mode 100644 index 0000000..063bfdb --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/addi-offset-fold.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +%struct.S0 = type <{ i32, [5 x i8] }> + +; Function Attrs: norecurse nounwind readnone +define signext i32 @foo([2 x i64] %a.coerce) local_unnamed_addr #0 { +entry: + %a = alloca %struct.S0, align 8 + %a.coerce.fca.0.extract = extractvalue [2 x i64] %a.coerce, 0 + %a.coerce.fca.1.extract = extractvalue [2 x i64] %a.coerce, 1 + %a.0.a.0..sroa_cast = bitcast %struct.S0* %a to i64* + store i64 %a.coerce.fca.0.extract, i64* %a.0.a.0..sroa_cast, align 8 + %tmp.sroa.2.0.extract.trunc = trunc i64 %a.coerce.fca.1.extract to i8 + %a.8.a.8..sroa_idx = getelementptr inbounds %struct.S0, %struct.S0* %a, i64 0, i32 1, i64 4 + store i8 %tmp.sroa.2.0.extract.trunc, i8* %a.8.a.8..sroa_idx, align 8 + %a.4.a.4..sroa_idx = getelementptr inbounds %struct.S0, %struct.S0* %a, i64 0, i32 1 + %a.4.a.4..sroa_cast = bitcast [5 x i8]* %a.4.a.4..sroa_idx to i40* + %a.4.a.4.bf.load = load i40, i40* %a.4.a.4..sroa_cast, align 4 + %bf.lshr = lshr i40 %a.4.a.4.bf.load, 31 + %bf.lshr.tr = trunc i40 %bf.lshr to i32 + %bf.cast = and i32 %bf.lshr.tr, 127 + ret i32 %bf.cast + +; CHECK-LABEL: @foo +; FIXME: We don't need to do these stores/loads at all. +; CHECK-DAG: std 3, -24(1) +; CHECK-DAG: stb 4, -16(1) +; CHECK: ori 2, 2, 0 +; CHECK-DAG: lbz [[REG1:[0-9]+]], -16(1) +; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1) +; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG1]], 32 +; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]] +; CHECK: rldicl 3, [[REG4]], 33, 57 +; CHECK: blr +} + +attributes #0 = { nounwind "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="ppc64le" } + -- 2.7.4