From fc752bb70aa8a52d7a683b88c5cc0fa234bdb91a Mon Sep 17 00:00:00 2001 From: Pablo Barrio Date: Wed, 7 Sep 2016 12:49:15 +0000 Subject: [PATCH] [ARM] Lower UDIV+UREM to UDIV+MLS (and the same for SREM) Summary: This saves a library call to __aeabi_uidivmod. However, the processor must feature hardware division in order to benefit from the transformation. Reviewers: scott-0, jmolloy, compnerd, rengolin Subscribers: t.p.northover, compnerd, aemerson, rengolin, samparker, llvm-commits Differential Revision: https://reviews.llvm.org/D24133 llvm-svn: 280808 --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 19 ++++++++++++++- llvm/test/CodeGen/ARM/urem-opt-size.ll | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index cc0b58f..31594e3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12098,6 +12098,24 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + SDLoc dl(Op); + + // If the target has hardware divide, use divide + multiply + subtract: + // div = a / b + // rem = a - b * div + // return {div, rem} + // This should be lowered into UDIV/SDIV + MLS later on. + if (Subtarget->hasDivide()) { + unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; + const SDValue Dividend = Op->getOperand(0); + const SDValue Divisor = Op->getOperand(1); + SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); + SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); + SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); + + SDValue Values[2] = {Div, Rem}; + return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); + } RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), VT.getSimpleVT().SimpleTy); @@ -12111,7 +12129,6 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); - SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) diff --git a/llvm/test/CodeGen/ARM/urem-opt-size.ll b/llvm/test/CodeGen/ARM/urem-opt-size.ll index 7f1cd43..b9c2188 100644 --- a/llvm/test/CodeGen/ARM/urem-opt-size.ll +++ b/llvm/test/CodeGen/ARM/urem-opt-size.ll @@ -3,7 +3,12 @@ ; expanded to a sequence of umull, lsrs, muls and sub instructions, but ; just a call to __aeabi_uidivmod. ; +; When the processor features hardware division, UDIV + UREM can be turned +; into UDIV + MLS. This prevents the library function __aeabi_uidivmod from being +; pulled into the binary. The test uses ARMv7-M. 
+; ; RUN: llc -mtriple=armv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7m-eabi -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=V7M target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv7m-arm-none-eabi" @@ -28,11 +33,16 @@ entry: ret i32 %div } +; Test for unsigned remainder define i32 @foo3() local_unnamed_addr #0 { entry: ; CHECK-LABEL: foo3: ; CHECK: __aeabi_uidivmod ; CHECK-NOT: umull +; V7M-LABEL: foo3: +; V7M: udiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] +; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] +; V7M-NOT: __aeabi_uidivmod %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %rem = urem i32 %call, 1000000 %cmp = icmp eq i32 %rem, 0 @@ -40,6 +50,39 @@ entry: ret i32 %conv } +; Test for signed remainder +define i32 @foo4() local_unnamed_addr #0 { +entry: +; CHECK-LABEL: foo4: +; CHECK:__aeabi_idivmod +; V7M-LABEL: foo4: +; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] +; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] +; V7M-NOT: __aeabi_idivmod + %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() + %rem = srem i32 %call, 1000000 + ret i32 %rem +} + +; Check that doing a sdiv+srem has the same effect as only the srem, +; as the division needs to be computed anyway in order to calculate +; the remainder (i.e. make sure we don't end up with two divisions). +define i32 @foo5() local_unnamed_addr #0 { +entry: +; CHECK-LABEL: foo5: +; CHECK:__aeabi_idivmod +; V7M-LABEL: foo5: +; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] +; V7M-NOT: sdiv +; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] +; V7M-NOT: __aeabi_idivmod + %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() + %div = sdiv i32 %call, 1000000 + %rem = srem i32 %call, 1000000 + %add = add i32 %div, %rem + ret i32 %add +} + declare i32 @GetValue(...) local_unnamed_addr attributes #0 = { minsize nounwind optsize } -- 2.7.4