From 34fc1db9a8b22300a90e71fe7285501e7bcdc90e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 29 Jun 2022 14:39:42 -0400 Subject: [PATCH] [LinkerWrapper] Change wrapping to include jumps for other variables Summary: We don't currently support other variable types, like managed or surface. This patch simply adds code that checks the flags and does nothing. This prevents us from registering a surface as a variable as we do now. In the future, registering these will require adding the flags to the entry struct. --- clang/test/Driver/linker-wrapper-image.c | 62 ++++++++++++++-------- .../tools/clang-linker-wrapper/OffloadWrapper.cpp | 50 +++++++++++++++-- 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 524fc25..fd4a9cc 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -35,7 +35,7 @@ // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple x86_64-unknown-linux-gnu \ // RUN: -linker-path /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA -// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin" +// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin" // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries" // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null @@ -43,7 +43,7 @@ // CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry] // CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] -// CUDA: define internal void 
@.cuda.fatbin_reg() section ".text.startup" { +// CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { // CUDA-NEXT: entry: // CUDA-NEXT: %0 = call ptr @__cudaRegisterFatBinary(ptr @.fatbin_wrapper) // CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8 @@ -53,41 +53,61 @@ // CUDA-NEXT: ret void // CUDA-NEXT: } -// CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" { +// CUDA: define internal void @.cuda.fatbin_unreg() section ".text.startup" { // CUDA-NEXT: entry: // CUDA-NEXT: %0 = load ptr, ptr @.cuda.binary_handle, align 8 // CUDA-NEXT: call void @__cudaUnregisterFatBinary(ptr %0) // CUDA-NEXT: ret void // CUDA-NEXT: } -// CUDA: define internal void @.cuda.globals_reg(ptr %0) section ".text.startup" { +// CUDA: define internal void @.cuda.globals_reg(ptr %0) section ".text.startup" { // CUDA-NEXT: entry: // CUDA-NEXT: br i1 icmp ne (ptr @__start_cuda_offloading_entries, ptr @__stop_cuda_offloading_entries), label %while.entry, label %while.end -// CUDA: while.entry: -// CUDA-NEXT: %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ] -// CUDA-NEXT: %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0 -// CUDA-NEXT: %addr = load ptr, ptr %1, align 8 -// CUDA-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1 -// CUDA-NEXT: %name = load ptr, ptr %2, align 8 -// CUDA-NEXT: %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2 -// CUDA-NEXT: %size = load i64, ptr %3, align 4 -// CUDA-NEXT: %4 = icmp eq i64 %size, 0 -// CUDA-NEXT: br i1 %4, label %if.then, label %if.else - -// CUDA: if.then: -// CUDA-NEXT: %5 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// CUDA: while.entry: +// CUDA-NEXT: %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ] +// CUDA-NEXT: %1 = getelementptr inbounds 
%__tgt_offload_entry, ptr %entry1, i64 0, i32 0 +// CUDA-NEXT: %addr = load ptr, ptr %1, align 8 +// CUDA-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1 +// CUDA-NEXT: %name = load ptr, ptr %2, align 8 +// CUDA-NEXT: %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2 +// CUDA-NEXT: %size = load i64, ptr %3, align 4 +// CUDA-NEXT: %4 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 3 +// CUDA-NEXT: %flag = load i32, ptr %4, align 4 +// CUDA-NEXT: %5 = icmp eq i64 %size, 0 +// CUDA-NEXT: br i1 %5, label %if.then, label %if.else + + +// CUDA: if.then: +// CUDA-NEXT: %6 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// CUDA-NEXT: br label %if.end + +// CUDA: if.else: +// CUDA-NEXT: switch i32 %flag, label %if.end [ +// CUDA-NEXT: i32 0, label %sw.global +// CUDA-NEXT: i32 1, label %sw.managed +// CUDA-NEXT: i32 2, label %sw.surface +// CUDA-NEXT: i32 3, label %sw.texture +// CUDA-NEXT: ] + +// CUDA: sw.global: +// CUDA-NEXT: call void @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0) +// CUDA-NEXT: br label %if.end + +// CUDA: sw.managed: +// CUDA-NEXT: br label %if.end + +// CUDA: sw.surface: // CUDA-NEXT: br label %if.end -// CUDA: if.else: -// CUDA-NEXT: %6 = call i32 @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0) +// CUDA: sw.texture: // CUDA-NEXT: br label %if.end -// CUDA: if.end: +// CUDA: if.end: // CUDA-NEXT: %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1 // CUDA-NEXT: %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries // CUDA-NEXT: br i1 %8, label %while.end, label %while.entry -// CUDA: while.end: +// CUDA: while.end: // CUDA-NEXT: ret void // CUDA-NEXT: } diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp index 
cc03058..847113e 100644 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -23,6 +23,19 @@ namespace { /// Magic number that begins the section containing the CUDA fatbinary. constexpr unsigned CudaFatMagic = 0x466243b1; +/// Copied from clang/CGCudaRuntime.h. +enum OffloadEntryKindFlag : uint32_t { + /// Mark the entry as a global entry. This indicates the presence of a + /// kernel if the size field is zero and a variable otherwise. + OffloadGlobalEntry = 0x0, + /// Mark the entry as a managed global variable. + OffloadGlobalManagedEntry = 0x1, + /// Mark the entry as a surface variable. + OffloadGlobalSurfaceEntry = 0x2, + /// Mark the entry as a texture variable. + OffloadGlobalTextureEntry = 0x3, +}; + IntegerType *getSizeTTy(Module &M) { LLVMContext &C = M.getContext(); switch (M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C))) { @@ -345,9 +358,6 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef Image) { /// 0, entry->size, 0, 0); /// } /// } -/// -/// TODO: This only registers functions are variables. Additional support is -/// required for texture / surface / managed variables. Function *createRegisterGlobalsFunction(Module &M) { LLVMContext &C = M.getContext(); // Get the __cudaRegisterFunction function declaration. @@ -363,7 +373,7 @@ Function *createRegisterGlobalsFunction(Module &M) { // Get the __cudaRegisterVar function declaration. 
auto *RegVarTy = FunctionType::get( - Type::getInt32Ty(C), + Type::getVoidTy(C), {Type::getInt8PtrTy(C)->getPointerTo(), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32Ty(C), getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)}, @@ -394,6 +404,10 @@ Function *createRegisterGlobalsFunction(Module &M) { auto *EntryBB = BasicBlock::Create(C, "while.entry", RegGlobalsFn); auto *IfThenBB = BasicBlock::Create(C, "if.then", RegGlobalsFn); auto *IfElseBB = BasicBlock::Create(C, "if.else", RegGlobalsFn); + auto *SwGlobalBB = BasicBlock::Create(C, "sw.global", RegGlobalsFn); + auto *SwManagedBB = BasicBlock::Create(C, "sw.managed", RegGlobalsFn); + auto *SwSurfaceBB = BasicBlock::Create(C, "sw.surface", RegGlobalsFn); + auto *SwTextureBB = BasicBlock::Create(C, "sw.texture", RegGlobalsFn); auto *IfEndBB = BasicBlock::Create(C, "if.end", RegGlobalsFn); auto *ExitBB = BasicBlock::Create(C, "while.end", RegGlobalsFn); @@ -416,9 +430,16 @@ Function *createRegisterGlobalsFunction(Module &M) { {ConstantInt::get(getSizeTTy(M), 0), ConstantInt::get(Type::getInt32Ty(C), 2)}); auto *Size = Builder.CreateLoad(getSizeTTy(M), SizePtr, "size"); + auto *FlagsPtr = + Builder.CreateInBoundsGEP(getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 3)}); + auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flag"); auto *FnCond = Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M))); Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB); + + // Create kernel registration code. Builder.SetInsertPoint(IfThenBB); Builder.CreateCall(RegFunc, {RegGlobalsFn->arg_begin(), Addr, Name, Name, @@ -430,11 +451,32 @@ Function *createRegisterGlobalsFunction(Module &M) { ConstantPointerNull::get(Type::getInt32PtrTy(C))}); Builder.CreateBr(IfEndBB); Builder.SetInsertPoint(IfElseBB); + + auto *Switch = Builder.CreateSwitch(Flags, IfEndBB); + // Create global variable registration code. 
+ Builder.SetInsertPoint(SwGlobalBB); Builder.CreateCall(RegVar, {RegGlobalsFn->arg_begin(), Addr, Name, Name, ConstantInt::get(Type::getInt32Ty(C), 0), Size, ConstantInt::get(Type::getInt32Ty(C), 0), ConstantInt::get(Type::getInt32Ty(C), 0)}); Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(OffloadGlobalEntry), SwGlobalBB); + + // Create managed variable registration code. + Builder.SetInsertPoint(SwManagedBB); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(OffloadGlobalManagedEntry), SwManagedBB); + + // Create surface variable registration code. + Builder.SetInsertPoint(SwSurfaceBB); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(OffloadGlobalSurfaceEntry), SwSurfaceBB); + + // Create texture variable registration code. + Builder.SetInsertPoint(SwTextureBB); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(OffloadGlobalTextureEntry), SwTextureBB); + Builder.SetInsertPoint(IfEndBB); auto *NewEntry = Builder.CreateInBoundsGEP( getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1)); -- 2.7.4