From: Wei Mi
Date: Mon, 16 Jul 2018 15:42:20 +0000 (+0000)
Subject: [RegAlloc] Skip global splitting if the live range is huge and its spill is
X-Git-Tag: llvmorg-7.0.0-rc1~1338
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=40c4aa7637d07a807ceb8f2a5d7d69cf3d50085a;p=platform%2Fupstream%2Fllvm.git

[RegAlloc] Skip global splitting if the live range is huge and its spill is
trivially rematerializable.

We ran into a case where MachineLICM hoists a large number of live ranges
out of a big loop because it considers them trivially rematerializable. In
regalloc, global splitting is tried first for those live ranges before they
are spilled and rematerialized. Because the global splitting algorithm is
quadratic, a large number of global splitting candidates causes a huge
compile time increase (50s to 1400s on my local machine when compiling a
module).

However, for live ranges which are very large and trivially rematerializable,
we think it is better to simply skip global splitting, saving compile time
with little chance of sacrificing performance. We use the number of segments
of the live range to indirectly estimate whether global splitting of the
live range would be too costly, and add an option as a knob to adjust the
size threshold.

Differential Revision: https://reviews.llvm.org/D49353

llvm-svn: 337186
---

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 07b201b..e8e6cf2 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -125,6 +125,11 @@ static cl::opt<bool> EnableDeferredSpilling(
              "variable because of other evicted variables."),
     cl::init(false));
 
+static cl::opt<unsigned long>
+    HugeSizeForSplit("huge-size-for-split", cl::Hidden,
+                     cl::desc("A threshold of live range size which may cause high compile time cost to handle."),
+                     cl::init(5000));
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned>
 CSRFirstTimeCost("regalloc-csr-first-time-cost",
@@ -478,6 +483,7 @@ private:
                     SmallVectorImpl<unsigned>&, unsigned = ~0u);
   unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
                           SmallVectorImpl<unsigned>&);
+  unsigned isSplitBenefitWorthCost(LiveInterval &VirtReg);
   /// Calculate cost of region splitting.
   unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
                                     AllocationOrder &Order,
@@ -1771,8 +1777,21 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
   MF->verify(this, "After splitting live range around region");
 }
 
+// Global split has high compile time cost, especially for a large live range.
+// Return false for the case here where the potential benefit will never be
+// worth the cost.
+unsigned RAGreedy::isSplitBenefitWorthCost(LiveInterval &VirtReg) {
+  MachineInstr *MI = MRI->getUniqueVRegDef(VirtReg.reg);
+  if (MI && TII->isTriviallyReMaterializable(*MI, AA) &&
+      VirtReg.size() > HugeSizeForSplit)
+    return false;
+  return true;
+}
+
 unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
                                   SmallVectorImpl<unsigned> &NewVRegs) {
+  if (!isSplitBenefitWorthCost(VirtReg))
+    return 0;
   unsigned NumCands = 0;
   BlockFrequency SpillCost = calcSpillCost();
   BlockFrequency BestCost;
diff --git a/llvm/test/CodeGen/X86/limit-split-cost.mir b/llvm/test/CodeGen/X86/limit-split-cost.mir
new file mode 100644
index 0000000..219ff37
--- /dev/null
+++ b/llvm/test/CodeGen/X86/limit-split-cost.mir
@@ -0,0 +1,150 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=x86_64-- -run-pass=greedy %s -debug-only=regalloc -huge-size-for-split=0 -o /dev/null 2>&1 | FileCheck %s
+# Check no global region split is needed because the live range to split is trivially rematerializable.
+# CHECK-NOT: Compact region bundles
+--- |
+  ; ModuleID = ''
+  source_filename = "2.cc"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  @m = local_unnamed_addr global i32 0, align 4
+  @.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
+  @.str.1 = private unnamed_addr constant [4 x i8] c"def\00", align 1
+  @.str.2 = private unnamed_addr constant [4 x i8] c"ghi\00", align 1
+
+  ; Function Attrs: uwtable
+  define void @_Z3fooi(i32 %value) local_unnamed_addr #0 {
+  entry:
+    br label %do.body
+
+  do.body:                                          ; preds = %do.cond, %entry
+    tail call void asm sideeffect "", "~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !3
+    switch i32 %value, label %do.cond [
+      i32 0, label %sw.bb
+      i32 1, label %sw.bb1
+      i32 2, label %sw.bb2
+    ]
+
+  sw.bb:                                            ; preds = %do.body
+    tail call void @_Z3gooPKc(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0))
+    br label %sw.bb1
+
+  sw.bb1:                                           ; preds = %sw.bb, %do.body
+    tail call void @_Z3gooPKc(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.1, i64 0, i64 0))
+    br label %sw.bb2
+
+  sw.bb2:                                           ; preds = %sw.bb1, %do.body
+    tail call void @_Z3gooPKc(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.2, i64 0, i64 0))
+    br label %do.cond
+
+  do.cond:                                          ; preds = %sw.bb2, %do.body
+    %0 = load i32, i32* @m, align 4, !tbaa !4
+    %cmp = icmp eq i32 %0, 5
+    br i1 %cmp, label %do.end, label %do.body
+
+  do.end:                                           ; preds = %do.cond
+    ret void
+  }
+
+  declare void @_Z3gooPKc(i8*) local_unnamed_addr #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 7, !"PIC Level", i32 2}
+  !2 = !{!"clang version 7.0.0 (trunk 335057)"}
+  !3 = !{i32 80}
+  !4 = !{!5, !5, i64 0}
+  !5 = !{!"int", !6, i64 0}
+  !6 = !{!"omnipotent char", !7, i64 0}
+  !7 = !{!"Simple C++ TBAA"}
+
+...
+---
+name: _Z3fooi
+alignment: 4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr32 }
+  - { id: 1, class: gr32 }
+  - { id: 2, class: gr32 }
+  - { id: 3, class: gr64 }
+  - { id: 4, class: gr64 }
+  - { id: 5, class: gr64 }
+  - { id: 6, class: gr64 }
+  - { id: 7, class: gr32 }
+  - { id: 8, class: gr32 }
+liveins:
+  - { reg: '$edi', virtual-reg: '%0' }
+frameInfo:
+  hasCalls: true
+body: |
+  bb.0.entry:
+    liveins: $edi
+
+    %0:gr32 = COPY $edi
+    %5:gr64 = LEA64r $rip, 1, $noreg, @.str.2, $noreg
+    %6:gr64 = MOV64rm $rip, 1, $noreg, target-flags(x86-gotpcrel) @m, $noreg :: (load 8 from got)
+    %4:gr64 = LEA64r $rip, 1, $noreg, @.str.1, $noreg
+    %3:gr64 = LEA64r $rip, 1, $noreg, @.str, $noreg
+
+  bb.1.do.body:
+    successors: %bb.6(0x20000000), %bb.2(0x60000000)
+
+    INLINEASM &"", 1, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15, 12, implicit-def dead early-clobber $eflags, !3
+    CMP32ri8 %0, 2, implicit-def $eflags
+    JE_1 %bb.6, implicit killed $eflags
+    JMP_1 %bb.2
+
+  bb.2.do.body:
+    successors: %bb.5(0x2aaaaaab), %bb.3(0x55555555)
+
+    CMP32ri8 %0, 1, implicit-def $eflags
+    JE_1 %bb.5, implicit killed $eflags
+    JMP_1 %bb.3
+
+  bb.3.do.body:
+    successors: %bb.4, %bb.7
+
+    TEST32rr %0, %0, implicit-def $eflags
+    JNE_1 %bb.7, implicit killed $eflags
+    JMP_1 %bb.4
+
+  bb.4.sw.bb:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %3
+    CALL64pcrel32 target-flags(x86-plt) @_Z3gooPKc, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+  bb.5.sw.bb1:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %4
+    CALL64pcrel32 target-flags(x86-plt) @_Z3gooPKc, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+  bb.6.sw.bb2:
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = COPY %5
+    CALL64pcrel32 target-flags(x86-plt) @_Z3gooPKc, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+  bb.7.do.cond:
+    successors: %bb.8(0x04000000), %bb.1(0x7c000000)
+
+    CMP32mi8 %6, 1, $noreg, 0, $noreg, 5, implicit-def $eflags :: (dereferenceable load 4 from @m, !tbaa !4)
+    JNE_1 %bb.1, implicit killed $eflags
+    JMP_1 %bb.8
+
+  bb.8.do.end:
+    RET 0
+
+...
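
For readers unfamiliar with the cl::opt mechanism the patch relies on, the following standalone C++ sketch (not part of the patch; the option name ExpensiveSizeLimit and the helper isBenefitWorthCost are illustrative only) shows the same gating pattern: a hidden command-line threshold lets a pass skip a quadratic path once the input grows past a configurable size.

    // Standalone sketch of the gating pattern used by the patch above.
    // The option and helper names here are illustrative, not LLVM APIs.
    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<unsigned long> ExpensiveSizeLimit(
        "expensive-size-limit", llvm::cl::Hidden,
        llvm::cl::desc("Skip the expensive path for inputs larger than this size."),
        llvm::cl::init(5000));

    // Mirrors the shape of isSplitBenefitWorthCost(): bail out when the work
    // is cheap to redo later (e.g. trivially rematerializable) and the input
    // is huge, so the quadratic analysis is unlikely to pay for its cost.
    static bool isBenefitWorthCost(unsigned long Size, bool CheapToRecompute) {
      if (CheapToRecompute && Size > ExpensiveSizeLimit)
        return false;
      return true;
    }

As the new test's RUN line shows, the knob can also be lowered (e.g. -huge-size-for-split=0) to force the skip for any trivially rematerializable live range.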