Don't inline dynamic allocas that simplify to huge static allocas.

author Amara Emerson <aemerson@apple.com>

Fri, 12 Jun 2020 17:19:28 +0000 (10:19 -0700)

committer Amara Emerson <aemerson@apple.com>

Thu, 25 Jun 2020 00:39:03 +0000 (17:39 -0700)
author Amara Emerson <aemerson@apple.com>
Fri, 12 Jun 2020 17:19:28 +0000 (10:19 -0700)
committer Amara Emerson <aemerson@apple.com>
Thu, 25 Jun 2020 00:39:03 +0000 (17:39 -0700)
diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h

index c697e01..7f04a8c 100644 (file)
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@@ -49,6 +49,9 @@ const int ColdccPenalty = 2000;
  /// Do not inline functions which allocate this many bytes on the stack
  /// when the caller is recursive.
  const unsigned TotalAllocaSizeRecursiveCaller = 1024;
+/// Do not inline when dynamic allocas that constant propagation has turned
+/// into static allocas bring the allocated size above this many bytes.
+const uint64_t MaxSimplifiedDynamicAllocaToInline = 65536;
  } // namespace InlineConstants
  
  /// Represents the cost of inlining a function.
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp

index f69c7f3..c05d1ee 100644 (file)
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -853,10 +853,22 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
    if (I.isArrayAllocation()) {
      Constant *Size = SimplifiedValues.lookup(I.getArraySize());
      if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+      // Constant propagation of the array size can convert a dynamic alloca
+      // into a static alloca, which may then become a huge static alloca on
+      // an unconditional CFG path. Refuse to inline when the accumulated
+      // allocated size would exceed a threshold.
+      // FIXME: If the threshold is removed or lowered too much, we could end up
+      // being too pessimistic and prevent inlining non-problematic code. This
+      // could result in unintended perf regressions. A better overall strategy
+      // is needed to track stack usage during inlining.
        Type *Ty = I.getAllocatedType();
        AllocatedSize = SaturatingMultiplyAdd(
            AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
            AllocatedSize);
+      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) {
+        HasDynamicAlloca = true;
+        return false;
+      }
        return Base::visitAlloca(I);
      }
    }
diff --git a/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll b/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll

new file mode 100644 (file)

index 0000000..11c38bf
--- /dev/null
+++ b/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -inline < %s -S -o - | FileCheck %s
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.15.0"
+
+define void @caller1(i8 *%p1, i1 %b) {
+; CHECK-LABEL: @caller1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
+; CHECK:       split:
+; CHECK-NEXT:    call void @callee(i8* [[P1:%.*]], i32 0, i32 -1)
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i1 %b, true
+  br i1 %cond, label %exit, label %split
+
+split:
+  ; This path may be generated from call-site splitting and never taken at runtime.
+  call void @callee(i8* %p1, i32 0, i32 -1)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define  void @callee(i8* %p1, i32 %l1, i32 %l2) {
+entry:
+  %ext = zext i32 %l2 to i64
+  %vla = alloca float, i64 %ext, align 16
+  call void @extern_call(float* nonnull %vla) #3
+  ret void
+}
+
+
+define void @caller2_below_threshold(i8 *%p1, i1 %b) {
+; CHECK-LABEL: @caller2_below_threshold(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VLA_I:%.*]] = alloca float, i64 15000, align 16
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
+; CHECK:       split:
+; CHECK-NEXT:    [[SAVEDSTACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[VLA_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 60000, i8* [[TMP0]])
+; CHECK-NEXT:    call void @extern_call(float* nonnull [[VLA_I]]) #2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[VLA_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 60000, i8* [[TMP1]])
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[SAVEDSTACK]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i1 %b, true
+  br i1 %cond, label %exit, label %split
+
+split:
+  call void @callee(i8* %p1, i32 0, i32 15000)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define  void @callee2_not_in_entry(i8* %p1, i32 %l1, i32 %l2) {
+entry:
+  %ext = zext i32 %l2 to i64
+  %c = icmp eq i32 %l1, 42
+  br i1 %c, label %bb2, label %bb3
+bb2:
+  %vla = alloca float, i64 %ext, align 16
+  call void @extern_call(float* nonnull %vla) #3
+  ret void
+bb3:
+  ret void
+}
+
+define void @caller3_alloca_not_in_entry(i8 *%p1, i1 %b) {
+; CHECK-LABEL: @caller3_alloca_not_in_entry(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
+; CHECK:       split:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i1 %b, true
+  br i1 %cond, label %exit, label %split
+
+split:
+  call void @callee2_not_in_entry(i8* %p1, i32 0, i32 -1)
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @caller4_over_threshold(i8 *%p1, i1 %b, i32 %len) {
+; CHECK-LABEL: @caller4_over_threshold(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
+; CHECK:       split:
+; CHECK-NEXT:    call void @callee(i8* [[P1:%.*]], i32 0, i32 16500)
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i1 %b, true
+  br i1 %cond, label %exit, label %split
+
+split:
+  call void @callee(i8* %p1, i32 0, i32 16500)
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare noalias i8* @malloc(i64)
+define i8* @stack_allocate(i32 %size) #2 {
+entry:
+  %cmp = icmp ult i32 %size, 100
+  %conv = zext i32 %size to i64
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %0 = alloca i8, i64 %conv, align 8
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %call = tail call i8* @malloc(i64 %conv) #3
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i8* [ %0, %if.then ], [ %call, %if.end ]
+  ret i8* %retval.0
+}
+
+define i8* @test_stack_allocate_always(i32 %size) {
+; CHECK-LABEL: @test_stack_allocate_always(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SAVEDSTACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i32 [[SIZE:%.*]], 100
+; CHECK-NEXT:    [[CONV_I:%.*]] = zext i32 [[SIZE]] to i64
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]]
+; CHECK:       if.then.i:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, i64 [[CONV_I]], align 8
+; CHECK-NEXT:    br label [[STACK_ALLOCATE_EXIT:%.*]]
+; CHECK:       if.end.i:
+; CHECK-NEXT:    [[CALL_I:%.*]] = tail call i8* @malloc(i64 [[CONV_I]]) #2
+; CHECK-NEXT:    br label [[STACK_ALLOCATE_EXIT]]
+; CHECK:       stack_allocate.exit:
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = phi i8* [ [[TMP0]], [[IF_THEN_I]] ], [ [[CALL_I]], [[IF_END_I]] ]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[SAVEDSTACK]])
+; CHECK-NEXT:    ret i8* [[RETVAL_0_I]]
+;
+entry:
+  %call = tail call i8* @stack_allocate(i32 %size)
+  ret i8* %call
+}
+
+declare void @extern_call(float*)
+
+attributes #1 = { argmemonly nounwind willreturn writeonly }
+attributes #2 = { alwaysinline }
+attributes #3 = { nounwind }
+
author	Amara Emerson <aemerson@apple.com>
	Fri, 12 Jun 2020 17:19:28 +0000 (10:19 -0700)
committer	Amara Emerson <aemerson@apple.com>
	Thu, 25 Jun 2020 00:39:03 +0000 (17:39 -0700)
llvm/include/llvm/Analysis/InlineCost.h		patch \| blob \| history
llvm/lib/Analysis/InlineCost.cpp		patch \| blob \| history
llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll	[new file with mode: 0644]	patch \| blob