Add branch weights to branches for static initializers.
authorRichard Smith <richard-llvm@metafoo.co.uk>
Wed, 26 Jul 2017 22:01:09 +0000 (22:01 +0000)
committerRichard Smith <richard-llvm@metafoo.co.uk>
Wed, 26 Jul 2017 22:01:09 +0000 (22:01 +0000)
The initializer for a static local variable cannot be hot, because it runs at
most once per program. That's not quite the same thing as having a low branch
probability, but under the assumption that the function is invoked many times,
modeling this as a branch probability seems reasonable.

For TLS variables, the situation is less clear, since the initialization side
of the branch can run multiple times in a program execution, but we still
expect initialization to be rare relative to non-initialization uses. It would
seem worthwhile to add a PGO counter along this path to make this estimation
more accurate in future.

For globals with guarded initialization, we don't yet apply any branch weights.
Due to our use of COMDATs, the guard will be reached exactly once per DSO, but
we have no idea how many DSOs will define the variable.

llvm-svn: 309195

clang/lib/CodeGen/CGDeclCXX.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/lib/CodeGen/ItaniumCXXABI.cpp
clang/lib/CodeGen/MicrosoftCXXABI.cpp
clang/test/CodeGenCXX/microsoft-abi-static-initializers.cpp
clang/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
clang/test/CodeGenCXX/static-init-wasm.cpp
clang/test/CodeGenCXX/static-initializer-branch-weights.cpp [new file with mode: 0644]

index d8768be..98d9e36 100644 (file)
@@ -18,6 +18,7 @@
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang;
@@ -259,6 +260,43 @@ void CodeGenFunction::EmitCXXGuardedInit(const VarDecl &D,
   CGM.getCXXABI().EmitGuardedInit(*this, D, DeclPtr, PerformInit);
 }
 
+void CodeGenFunction::EmitCXXGuardedInitBranch(llvm::Value *NeedsInit,
+                                               llvm::BasicBlock *InitBlock,
+                                               llvm::BasicBlock *NoInitBlock,
+                                               GuardKind Kind,
+                                               const VarDecl *D) {
+  assert((Kind == GuardKind::TlsGuard || D) && "no guarded variable");
+
+  // A guess at how many times we will enter the initialization of a
+  // variable, depending on the kind of variable.
+  static const uint64_t InitsPerTLSVar = 1024;
+  static const uint64_t InitsPerLocalVar = 1024 * 1024;
+
+  llvm::MDNode *Weights;
+  if (Kind == GuardKind::VariableGuard && !D->isLocalVarDecl()) {
+    // For non-local variables, don't apply any weighting for now. Due to our
+    // use of COMDATs, we expect there to be at most one initialization of the
+    // variable per DSO, but we have no way to know how many DSOs will try to
+    // initialize the variable.
+    Weights = nullptr;
+  } else {
+    uint64_t NumInits;
+    // FIXME: For the TLS case, collect and use profiling information to
+    // determine a more accurate brach weight.
+    if (Kind == GuardKind::TlsGuard || D->getTLSKind())
+      NumInits = InitsPerTLSVar;
+    else
+      NumInits = InitsPerLocalVar;
+
+    // The probability of us entering the initializer is
+    //   1 / (total number of times we attempt to initialize the variable).
+    llvm::MDBuilder MDHelper(CGM.getLLVMContext());
+    Weights = MDHelper.createBranchWeights(1, NumInits - 1);
+  }
+
+  Builder.CreateCondBr(NeedsInit, InitBlock, NoInitBlock, Weights);
+}
+
 llvm::Function *CodeGenModule::CreateGlobalInitOrDestructFunction(
     llvm::FunctionType *FTy, const Twine &Name, const CGFunctionInfo &FI,
     SourceLocation Loc, bool TLS) {
@@ -539,7 +577,8 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn,
                                                  "guard.uninitialized");
       llvm::BasicBlock *InitBlock = createBasicBlock("init");
       ExitBlock = createBasicBlock("exit");
-      Builder.CreateCondBr(Uninit, InitBlock, ExitBlock);
+      EmitCXXGuardedInitBranch(Uninit, InitBlock, ExitBlock,
+                               GuardKind::TlsGuard, nullptr);
       EmitBlock(InitBlock);
       // Mark as initialized before initializing anything else. If the
       // initializers use previously-initialized thread_local vars, that's
index 753dd92..2e31be8 100644 (file)
@@ -3496,6 +3496,14 @@ public:
   void EmitCXXGuardedInit(const VarDecl &D, llvm::GlobalVariable *DeclPtr,
                           bool PerformInit);
 
+  enum class GuardKind { VariableGuard, TlsGuard };
+
+  /// Emit a branch to select whether or not to perform guarded initialization.
+  void EmitCXXGuardedInitBranch(llvm::Value *NeedsInit,
+                                llvm::BasicBlock *InitBlock,
+                                llvm::BasicBlock *NoInitBlock,
+                                GuardKind Kind, const VarDecl *D);
+
   /// GenerateCXXGlobalInitFunc - Generates code for initializing global
   /// variables.
   void GenerateCXXGlobalInitFunc(llvm::Function *Fn,
index d1f47b6..de9fd04 100644 (file)
@@ -2113,13 +2113,14 @@ void ItaniumCXXABI::EmitGuardedInit(CodeGenFunction &CGF,
       (UseARMGuardVarABI && !useInt8GuardVariable)
           ? Builder.CreateAnd(LI, llvm::ConstantInt::get(CGM.Int8Ty, 1))
           : LI;
-  llvm::Value *isInitialized = Builder.CreateIsNull(V, "guard.uninitialized");
+  llvm::Value *NeedsInit = Builder.CreateIsNull(V, "guard.uninitialized");
 
   llvm::BasicBlock *InitCheckBlock = CGF.createBasicBlock("init.check");
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
 
   // Check if the first byte of the guard variable is zero.
-  Builder.CreateCondBr(isInitialized, InitCheckBlock, EndBlock);
+  CGF.EmitCXXGuardedInitBranch(NeedsInit, InitCheckBlock, EndBlock,
+                               CodeGenFunction::GuardKind::VariableGuard, &D);
 
   CGF.EmitBlock(InitCheckBlock);
 
index 78b510b..409bad7 100644 (file)
@@ -2463,11 +2463,12 @@ void MicrosoftCXXABI::EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
     // Test our bit from the guard variable.
     llvm::ConstantInt *Bit = llvm::ConstantInt::get(GuardTy, 1ULL << GuardNum);
     llvm::LoadInst *LI = Builder.CreateLoad(GuardAddr);
-    llvm::Value *IsInitialized =
-        Builder.CreateICmpNE(Builder.CreateAnd(LI, Bit), Zero);
+    llvm::Value *NeedsInit =
+        Builder.CreateICmpEQ(Builder.CreateAnd(LI, Bit), Zero);
     llvm::BasicBlock *InitBlock = CGF.createBasicBlock("init");
     llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
-    Builder.CreateCondBr(IsInitialized, EndBlock, InitBlock);
+    CGF.EmitCXXGuardedInitBranch(NeedsInit, InitBlock, EndBlock,
+                                 CodeGenFunction::GuardKind::VariableGuard, &D);
 
     // Set our bit in the guard variable and emit the initializer and add a global
     // destructor if appropriate.
@@ -2502,7 +2503,8 @@ void MicrosoftCXXABI::EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
         Builder.CreateICmpSGT(FirstGuardLoad, InitThreadEpoch);
     llvm::BasicBlock *AttemptInitBlock = CGF.createBasicBlock("init.attempt");
     llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
-    Builder.CreateCondBr(IsUninitialized, AttemptInitBlock, EndBlock);
+    CGF.EmitCXXGuardedInitBranch(IsUninitialized, AttemptInitBlock, EndBlock,
+                                 CodeGenFunction::GuardKind::VariableGuard, &D);
 
     // This BasicBlock attempts to determine whether or not this thread is
     // responsible for doing the initialization.
index 57a72d4..0b84f07 100644 (file)
@@ -146,7 +146,7 @@ inline S &getS() {
 // CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.S* @"\01?getS@@YAAAUS@@XZ"() {{.*}} comdat
 // CHECK: load i32, i32* @"\01??_B?1??getS@@YAAAUS@@XZ@51"
 // CHECK: and i32 {{.*}}, 1
-// CHECK: icmp ne i32 {{.*}}, 0
+// CHECK: icmp eq i32 {{.*}}, 0
 // CHECK: br i1
 //   init:
 // CHECK: or i32 {{.*}}, 1
index 0202586..3f53e63 100644 (file)
@@ -24,8 +24,8 @@ extern inline S &f() {
   static thread_local S s;
 // CHECK:       %[[guard:.*]] = load i32, i32* @"\01??__J?1??f@@YAAAUS@@XZ@51"
 // CHECK-NEXT:  %[[mask:.*]] = and i32 %[[guard]], 1
-// CHECK-NEXT:  %[[cmp:.*]] = icmp ne i32 %[[mask]], 0
-// CHECK-NEXT:  br i1 %[[cmp]], label %[[init_end:.*]], label %[[init:.*]]
+// CHECK-NEXT:  %[[cmp:.*]] = icmp eq i32 %[[mask]], 0
+// CHECK-NEXT:  br i1 %[[cmp]], label %[[init:.*]], label %[[init_end:.*]], !prof ![[unlikely_threadlocal:.*]]
 //
 // CHECK:     [[init]]:
 // CHECK-NEXT:  %[[or:.*]] = or i32 %[[guard]], 1
@@ -56,7 +56,7 @@ extern inline S &g() {
 // CHECK:  %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
 // CHECK-NEXT:  %[[epoch:.*]] = load i32, i32* @_Init_thread_epoch
 // CHECK-NEXT:  %[[cmp:.*]] = icmp sgt i32 %[[guard]], %[[epoch]]
-// CHECK-NEXT:  br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]]
+// CHECK-NEXT:  br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]], !prof ![[unlikely_staticlocal:.*]]
 //
 // CHECK:     [[init_attempt]]:
 // CHECK-NEXT:  call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
@@ -95,3 +95,6 @@ int g1() {
   static int i = f1();
   return i;
 }
+
+// CHECK-DAG: ![[unlikely_threadlocal]] = !{!"branch_weights", i32 1, i32 1023}
+// CHECK-DAG: ![[unlikely_staticlocal]] = !{!"branch_weights", i32 1, i32 1048575}
index 289c3ea..5f2f94f 100644 (file)
@@ -20,7 +20,7 @@ void g() {
 // WEBASSEMBLY32:       %[[R0:.+]] = load atomic i8, i8* bitcast (i32* @_ZGVZ1gvE1a to i8*) acquire, align 4
 // WEBASSEMBLY32-NEXT:  %[[R1:.+]] = and i8 %[[R0]], 1
 // WEBASSEMBLY32-NEXT:  %[[R2:.+]] = icmp eq i8 %[[R1]], 0
-// WEBASSEMBLY32-NEXT:  br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]]
+// WEBASSEMBLY32-NEXT:  br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]],
 // WEBASSEMBLY32:       [[CHECK]]
 // WEBASSEMBLY32:       call i32 @__cxa_guard_acquire
 // WEBASSEMBLY32:       [[END]]
@@ -30,7 +30,7 @@ void g() {
 // WEBASSEMBLY64:       %[[R0:.+]] = load atomic i8, i8* bitcast (i64* @_ZGVZ1gvE1a to i8*) acquire, align 8
 // WEBASSEMBLY64-NEXT:  %[[R1:.+]] = and i8 %[[R0]], 1
 // WEBASSEMBLY64-NEXT:  %[[R2:.+]] = icmp eq i8 %[[R1]], 0
-// WEBASSEMBLY64-NEXT:  br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]]
+// WEBASSEMBLY64-NEXT:  br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]],
 // WEBASSEMBLY64:       [[CHECK]]
 // WEBASSEMBLY64:       call i32 @__cxa_guard_acquire
 // WEBASSEMBLY64:       [[END]]
diff --git a/clang/test/CodeGenCXX/static-initializer-branch-weights.cpp b/clang/test/CodeGenCXX/static-initializer-branch-weights.cpp
new file mode 100644 (file)
index 0000000..f9e7781
--- /dev/null
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -emit-llvm -std=c++1z %s -o - -triple=x86_64-linux-gnu | FileCheck %s
+
+struct S { S(); ~S(); };
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: br
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @global)
+S global;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// FIXME: Do we really need thread-safe initialization here? We don't run
+// global ctors on multiple threads. (If we were to do so, we'd need thread-safe
+// init for B<int>::member and B<int>::inline_member too.)
+// CHECK: load atomic i8, i8* bitcast (i64* @_ZGV13inline_global to i8*) acquire,
+// CHECK: icmp eq i8 {{.*}}, 0
+// CHECK: br i1
+// CHECK-NOT: !prof
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @inline_global)
+inline S inline_global;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: br
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @thread_local_global)
+thread_local S thread_local_global;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK: load i8, i8* bitcast (i64* @_ZGV26thread_local_inline_global to i8*)
+// CHECK: icmp eq i8 {{.*}}, 0
+// CHECK: br i1
+// CHECK-NOT: !prof
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @thread_local_inline_global)
+thread_local inline S thread_local_inline_global;
+
+struct A {
+  static S member;
+  static thread_local S thread_local_member;
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load atomic i8, i8* bitcast (i64* @_ZGVN1A13inline_memberE to i8*) acquire,
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A13inline_memberE)
+  static inline S inline_member;
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load i8, i8* bitcast (i64* @_ZGVN1A26thread_local_inline_memberE to i8*)
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A26thread_local_inline_memberE)
+  static thread_local inline S thread_local_inline_member;
+};
+
+// CHECK-LABEL: define void @_Z1fv()
+void f() {
+  // CHECK: load atomic i8, i8* bitcast (i64* @_ZGVZ1fvE12static_local to i8*) acquire,
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_LOCAL:[0-9]*]]
+  static S static_local;
+
+  // CHECK: load i8, i8* @_ZGVZ1fvE19static_thread_local,
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_THREAD_LOCAL:[0-9]*]]
+  static thread_local S static_thread_local;
+}
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: br
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A6memberE)
+S A::member;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: br
+// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A19thread_local_memberE)
+thread_local S A::thread_local_member;
+
+template <typename T> struct B {
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE6memberE to i8*)
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE6memberE)
+  static S member;
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE13inline_memberE to i8*)
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE13inline_memberE)
+  static inline S inline_member;
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE19thread_local_memberE to i8*)
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE19thread_local_memberE)
+  static thread_local S thread_local_member;
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE26thread_local_inline_memberE to i8*)
+  // CHECK: icmp eq i8 {{.*}}, 0
+  // CHECK: br i1
+  // CHECK-NOT: !prof
+  // CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE26thread_local_inline_memberE)
+  static thread_local inline S thread_local_inline_member;
+};
+template<typename T> S B<T>::member;
+template<typename T> thread_local S B<T>::thread_local_member;
+
+template<typename ...T> void use(T &...);
+void use_b() {
+  use(B<int>::member, B<int>::inline_member, B<int>::thread_local_member,
+      B<int>::thread_local_inline_member);
+}
+
+// CHECK-LABEL: define {{.*}}tls_init()
+// CHECK: load i8, i8* @__tls_guard, align 1
+// CHECK: icmp eq i8 {{.*}}, 0
+// CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_THREAD_LOCAL]]
+
+// CHECK-DAG: ![[WEIGHTS_THREAD_LOCAL]] = !{!"branch_weights", i32 1, i32 1023}
+// CHECK-DAG: ![[WEIGHTS_LOCAL]] = !{!"branch_weights", i32 1, i32 1048575}