This adds the ExpandLargeDivRem pass to the default pass pipeline.
The bit width above which it expands div/rem instructions is configured
via a new TargetTransformInfo hook (default: no expansion).
The X86, Arm and AArch64 backends implement this hook so that div/rem
instructions wider than the largest division libcall the target supports
(128 bits on 64-bit targets, 64 bits otherwise) are expanded.
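For illustration (a sketch, not code from this patch): with a hook value
of 128, the i129 division below is expanded into a loop in IR (its blocks
are named "udiv-do-while", which the updated tests match), while the i128
division is legalized as before:

  define i129 @expanded(i129 %a, i129 %b) {
    ; wider than 128 bits: rewritten by ExpandLargeDivRem
    %q = udiv i129 %a, %b
    ret i129 %q
  }

  define i128 @kept(i128 %a, i128 %b) {
    ; at most 128 bits: untouched (e.g. a __udivti3 libcall on x86-64)
    %q = udiv i128 %a, %b
    ret i128 %q
  }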
Differential Revision: https://reviews.llvm.org/D130076
/// would typically be allowed using throughput or size cost models.
bool hasDivRemOp(Type *DataType, bool IsSigned) const;
+ /// Returns the maximum bitwidth of legal div and rem instructions.
+ unsigned maxLegalDivRemBitWidth() const;
+
/// Return true if the given instruction (assumed to be a memory access
/// instruction) has a volatile variant. If that's the case then we can avoid
/// addrspacecast to generic AS for volatile loads/stores. Default
const SmallBitVector &OpcodeMask) const = 0;
virtual bool enableOrderedReductions() = 0;
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
+ virtual unsigned maxLegalDivRemBitWidth() = 0;
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
virtual bool prefersVectorizedAddressing() = 0;
virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
bool hasDivRemOp(Type *DataType, bool IsSigned) override {
return Impl.hasDivRemOp(DataType, IsSigned);
}
+ unsigned maxLegalDivRemBitWidth() override {
+ return Impl.maxLegalDivRemBitWidth();
+ }
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
return Impl.hasVolatileVariant(I, AddrSpace);
}
bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
+  unsigned maxLegalDivRemBitWidth() const {
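+    // IntegerType::MAX_INT_BITS is effectively "no limit": with this
+    // default, no div/rem instruction is expanded.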
+ return llvm::IntegerType::MAX_INT_BITS;
+ }
+
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
return false;
}
return TTIImpl->hasDivRemOp(DataType, IsSigned);
}
+unsigned TargetTransformInfo::maxLegalDivRemBitWidth() const {
+ return TTIImpl->maxLegalDivRemBitWidth();
+}
+
bool TargetTransformInfo::hasVolatileVariant(Instruction *I,
unsigned AddrSpace) const {
return TTIImpl->hasVolatileVariant(I, AddrSpace);
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
using namespace llvm;
static cl::opt<unsigned>
- ExpandDivRemBits("expand-div-rem-bits", cl::Hidden, cl::init(128),
+ ExpandDivRemBits("expand-div-rem-bits", cl::Hidden,
+ cl::init(llvm::IntegerType::MAX_INT_BITS),
cl::desc("div and rem instructions on integers with "
"more than <N> bits are expanded."));
-static bool runImpl(Function &F) {
+static bool isConstantPowerOfTwo(llvm::Value *V, bool SignedOp) {
+ auto *C = dyn_cast<ConstantInt>(V);
+ if (!C)
+ return false;
+
+ APInt Val = C->getValue();
+ if (SignedOp && Val.isNegative())
+ Val = -Val;
+ return Val.isPowerOf2();
+}
+
+static bool isSigned(unsigned int Opcode) {
+ return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+}
+
+static bool runImpl(Function &F, const TargetTransformInfo &TTI) {
SmallVector<BinaryOperator *, 4> Replace;
bool Modified = false;
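+  // The -expand-div-rem-bits flag overrides the target hook; its default
+  // of MAX_INT_BITS means the flag was not set on the command line.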
+ unsigned MaxLegalDivRemBitWidth = TTI.maxLegalDivRemBitWidth();
+ if (ExpandDivRemBits != llvm::IntegerType::MAX_INT_BITS)
+ MaxLegalDivRemBitWidth = ExpandDivRemBits;
+
+ if (MaxLegalDivRemBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+ return false;
+
for (auto &I : instructions(F)) {
switch (I.getOpcode()) {
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem: {
// TODO: This doesn't handle vectors.
auto *IntTy = dyn_cast<IntegerType>(I.getType());
- if (!IntTy || IntTy->getIntegerBitWidth() <= ExpandDivRemBits)
+ if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
+ continue;
+
+ // The backend has peephole optimizations for powers of two.
+ if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
continue;
Replace.push_back(&cast<BinaryOperator>(I));
PreservedAnalyses ExpandLargeDivRemPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = runImpl(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ bool Changed = runImpl(F, TTI);
if (Changed)
return PreservedAnalyses::none();
initializeExpandLargeDivRemLegacyPassPass(*PassRegistry::getPassRegistry());
}
- bool runOnFunction(Function &F) override { return runImpl(F); }
+ bool runOnFunction(Function &F) override {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runImpl(F, TTI);
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
addPass(createPreISelIntrinsicLoweringPass());
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
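+  // Expand illegal-width div/rem early so the rest of the IR pipeline sees
+  // the expansion loop instead of the oversized instruction.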
+ addPass(createExpandLargeDivRemPass());
addIRPasses();
addCodeGenPrepare();
addPassesToHandleExceptions();
bool enableOrderedReductions() const { return true; }
+ unsigned maxLegalDivRemBitWidth() const { return 128; }
+
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
return isLegalMaskedGather(Ty, Alignment);
}
+ unsigned maxLegalDivRemBitWidth() const { return 64; }
+
InstructionCost getMemcpyCost(const Instruction *I);
int getNumMemOps(const IntrinsicInst *I) const;
return BaseT::isExpensiveToSpeculativelyExecute(I);
}
+unsigned X86TTIImpl::maxLegalDivRemBitWidth() const {
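+  // 32-bit x86 has no 128-bit division libcalls (__divti3 and friends),
+  // so cap the legal width at 64 bits there.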
+ return ST->is64Bit() ? 128 : 64;
+}
+
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
return false;
}
const SmallBitVector &OpcodeMask) const;
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isExpensiveToSpeculativelyExecute(const Instruction *I);
+ unsigned maxLegalDivRemBitWidth() const;
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT: FunctionPass Manager
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnuabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some larger bit widths
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}
; CHECK: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Dominator Tree Construction
--- /dev/null
+; RUN: llc -mtriple=arm-eabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some larger bit widths
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: pushl 28(%ebp)
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl %ecx, 12(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 4(%edi)
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ebx, (%edx)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull 32(%ebp), %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: scalar_i128:
; X64: # %bb.0:
define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: pushl 28(%ebp)
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl %ecx, 12(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 4(%edi)
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ebx, (%edx)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull 32(%ebp), %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: scalar_i128:
; X64: # %bb.0:
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-5
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-3
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test3:
; X64: # %bb.0:
define i128 @test2(i128 %x) nounwind {
; X86-LABEL: test2:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-4
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test2:
; X64: # %bb.0:
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-5
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-3
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test3:
; X64: # %bb.0:
+++ /dev/null
-; RUN: llc -mtriple=i686-linux-gnu -o - %s | FileCheck %s
-
-@var = global i128 0
-
-; We were trying to convert the i128 operation into a libcall, but failing to
-; perform sret demotion when we couldn't return the result in registers. Make
-; sure we marshal the return properly:
-
-define void @test_sret_libcall(i128 %l, i128 %r) {
-; CHECK-LABEL: test_sret_libcall:
-
- ; Stack for call: 4(sret ptr), 16(i128 %l), 16(128 %r). So next logical
- ; (aligned) place for the actual sret data is %esp + 20.
-; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl [[SRET_ADDR]]
-
-; CHECK: calll __udivti3
-
-; CHECK: addl $44, %esp
-; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
-; CHECK-DAG: movl [[RES0]], var
-; CHECK-DAG: movl [[RES1]], var+4
-; CHECK-DAG: movl [[RES2]], var+8
-; CHECK-DAG: movl [[RES3]], var+12
- %quot = udiv i128 %l, %r
- store i128 %quot, ptr @var
- ret void
-}
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
; X64-NEXT: movq %rax, (%rax)
; X64-NEXT: movb $0, (%rax)
; X64-NEXT: retq
-;
-; X86-LABEL: f:
-; X86: # %bb.0: # %BB
-; X86-NEXT: pushl %ebp
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: .cfi_def_cfa_register %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movzbl (%eax), %eax
-; X86-NEXT: cmpb $0, (%eax)
-; X86-NEXT: setne (%eax)
-; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%eax)
-; X86-NEXT: movb $0, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: .cfi_def_cfa %esp, 4
-; X86-NEXT: retl
BB:
%A30 = alloca i66
%L17 = load i66, ptr %A30
--- /dev/null
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; X86-LABEL: udiv65:
+; X86-NOT: call
+;
+; X64-LABEL: udiv65:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: callq __udivti3@PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: udiv129:
+; X86-NOT: call
+;
+; X64-LABEL: udiv129:
+; X64-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: urem129:
+; X86-NOT: call
+;
+; X64-LABEL: urem129:
+; X64-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: sdiv129:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv129:
+; X64-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: srem129:
+; X86-NOT: call
+;
+; X64-LABEL: srem129:
+; X64-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some larger bit widths
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; X86-LABEL: sdiv257:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv257:
+; X64-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}