From c66d254ded501057a593260639a2bf80058b544c Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Wed, 27 Jun 2018 18:32:51 +0000
Subject: [PATCH] [CUDA] Use atexit() to call module destructor.

This matches the way NVCC does it. Doing module cleanup at global
destructor phase used to work, but is, apparently, too late for
the CUDA runtime in CUDA-9.2, which ends up crashing with double-free.

Differential Revision: https://reviews.llvm.org/D48613

llvm-svn: 335763
---
 clang/lib/CodeGen/CGCUDANV.cpp        | 13 +++++++++++++
 clang/lib/CodeGen/CodeGenModule.cpp   |  5 ++---
 clang/test/CodeGenCUDA/device-stub.cu |  4 ++--
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 7322095..f8a7825 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -472,6 +472,19 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
   }
 
+  // Create destructor and register it with atexit() the way NVCC does it. Doing
+  // it during regular destructor phase worked in CUDA before 9.2 but results in
+  // double-free in 9.2.
+  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
+    // extern "C" int atexit(void (*f)(void));
+    llvm::FunctionType *AtExitTy =
+        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
+    llvm::Constant *AtExitFunc =
+        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
+                                  /*Local=*/true);
+    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
+  }
+
   CtorBuilder.CreateRetVoid();
   return ModuleCtorFunc;
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 7752dbc..13afd4b 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -404,10 +404,9 @@ void CodeGenModule::Release() {
       AddGlobalCtor(ObjCInitFunction);
   if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
       CUDARuntime) {
-    if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction())
+    if (llvm::Function *CudaCtorFunction =
+            CUDARuntime->makeModuleCtorFunction())
       AddGlobalCtor(CudaCtorFunction);
-    if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction())
-      AddGlobalDtor(CudaDtorFunction);
   }
   if (OpenMPRuntime) {
     if (llvm::Function *OpenMPRegistrationFunction =
diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu
index 3798b8c..7aae4fb 100644
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@@ -86,8 +86,6 @@ void use_pointers() {
 // HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
-// * In separate mode we also register a destructor.
-// NORDC: @llvm.global_dtors = appending global {{.*}}@__[[PREFIX]]_module_dtor
 // * Alias to global symbol containing the NVModuleID.
 // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
 // RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
@@ -127,6 +125,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 //   .. and then calls __[[PREFIX]]_register_globals
 // NORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// * In separate mode we also register a destructor.
+// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
 
 // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
 // RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
-- 
2.7.4