From c0e42750da5f4eaecb00ce46e4a5cae8e4cddc3d Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 28 Jan 2016 23:58:28 +0000 Subject: [PATCH] [CUDA] Generate CUDA's printf alloca in its function's entry block. Summary: This is necessary to prevent llvm from generating stacksave intrinsics around this alloca. NVVM doesn't have a stack, and we don't handle said intrinsics. Reviewers: rnk, echristo Subscribers: cfe-commits, jhen, tra Differential Revision: http://reviews.llvm.org/D16664 llvm-svn: 259122 --- clang/lib/CodeGen/CGCUDABuiltin.cpp | 57 ++++++++++++------------------------- clang/test/CodeGenCUDA/printf.cu | 56 +++++++++++++++--------------------- 2 files changed, 41 insertions(+), 72 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDABuiltin.cpp b/clang/lib/CodeGen/CGCUDABuiltin.cpp index 6b9b970..0ccba89 100644 --- a/clang/lib/CodeGen/CGCUDABuiltin.cpp +++ b/clang/lib/CodeGen/CGCUDABuiltin.cpp @@ -52,10 +52,13 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) { // // is converted into something resembling // -// char* buf = alloca(...); -// *reinterpret_cast(buf) = arg1; -// *reinterpret_cast(buf + ...) = arg2; -// *reinterpret_cast(buf + ...) = arg3; +// struct Tmp { +// Arg1 a1; +// Arg2 a2; +// Arg3 a3; +// }; +// char* buf = alloca(sizeof(Tmp)); +// *(Tmp*)buf = {a1, a2, a3}; // vprintf("format string", buf); // // buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the @@ -80,48 +83,24 @@ CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E, E->arguments(), E->getDirectCallee(), /* ParamsToSkip = */ 0); - // Figure out how large of a buffer we need to hold our varargs and how - // aligned the buffer needs to be. We start iterating at Arg[1], because - // that's our first vararg. - unsigned BufSize = 0; - unsigned BufAlign = 0; - for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { - const RValue& RV = Args[I].RV; - llvm::Type* Ty = RV.getScalarVal()->getType(); - - auto Align = DL.getPrefTypeAlignment(Ty); - BufAlign = std::max(BufAlign, Align); - // Add padding required to keep the current arg aligned. - BufSize = llvm::alignTo(BufSize, Align); - BufSize += DL.getTypeAllocSize(Ty); - } - - // Construct and fill the buffer. - llvm::Value* BufferPtr = nullptr; - if (BufSize == 0) { + // Construct and fill the args buffer that we'll pass to vprintf. + llvm::Value *BufferPtr; + if (Args.size() <= 1) { // If there are no args, pass a null pointer to vprintf. BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); } else { - BufferPtr = Builder.Insert(new llvm::AllocaInst( - llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize), - BufAlign, "printf_arg_buf")); + llvm::SmallVector ArgTypes; + for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) + ArgTypes.push_back(Args[I].RV.getScalarVal()->getType()); + llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args"); + llvm::Value *Alloca = CreateTempAlloca(AllocaTy); - unsigned Offset = 0; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { + llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1); llvm::Value *Arg = Args[I].RV.getScalarVal(); - llvm::Type *Ty = Arg->getType(); - auto Align = DL.getPrefTypeAlignment(Ty); - - // Pad the buffer to Arg's alignment. - Offset = llvm::alignTo(Offset, Align); - - // Store Arg into the buffer at Offset. - llvm::Value *GEP = - Builder.CreateGEP(BufferPtr, llvm::ConstantInt::get(Int32Ty, Offset)); - llvm::Value *Cast = Builder.CreateBitCast(GEP, Ty->getPointerTo()); - Builder.CreateAlignedStore(Arg, Cast, Align); - Offset += DL.getTypeAllocSize(Ty); + Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType())); } + BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); } // Invoke vprintf and return. diff --git a/clang/test/CodeGenCUDA/printf.cu b/clang/test/CodeGenCUDA/printf.cu index f91aba7..dc3f4ea 100644 --- a/clang/test/CodeGenCUDA/printf.cu +++ b/clang/test/CodeGenCUDA/printf.cu @@ -9,45 +9,35 @@ extern "C" __device__ int vprintf(const char*, const char*); // Check a simple call to printf end-to-end. +// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double } __device__ int CheckSimple() { + // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]] // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt - const char* fmt = "%d"; - // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4 - // CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0 - // CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32* - // CHECK: store i32 42, i32* [[CAST]], align 4 - // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF]]) + const char* fmt = "%d %lld %f"; + // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0 + // CHECK: store i32 1, i32* [[PTR0]], align 4 + // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1 + // CHECK: store i64 2, i64* [[PTR1]], align 8 + // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2 + // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8 + // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8* + // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]]) // CHECK: ret i32 [[RET]] - return printf(fmt, 42); -} - -// Check that the args' types are promoted correctly when we call printf. -__device__ void CheckTypes() { - // CHECK: alloca {{.*}} align 8 - // CHECK: getelementptr {{.*}} i32 0 - // CHECK: bitcast {{.*}} to i32* - // CHECK: getelementptr {{.*}} i32 4 - // CHECK: bitcast {{.*}} to i32* - // CHECK: getelementptr {{.*}} i32 8 - // CHECK: bitcast {{.*}} to double* - // CHECK: getelementptr {{.*}} i32 16 - // CHECK: bitcast {{.*}} to double* - printf("%d %d %f %f", (char)1, (short)2, 3.0f, 4.0); -} - -// Check that the args are aligned properly in the buffer. -__device__ void CheckAlign() { - // CHECK: alloca i8, i32 40, align 8 - // CHECK: getelementptr {{.*}} i32 0 - // CHECK: getelementptr {{.*}} i32 8 - // CHECK: getelementptr {{.*}} i32 16 - // CHECK: getelementptr {{.*}} i32 20 - // CHECK: getelementptr {{.*}} i32 24 - // CHECK: getelementptr {{.*}} i32 32 - printf("%d %f %d %d %d %lld", 1, 2.0, 3, 4, 5, (long long)6); + return printf(fmt, 1, 2ll, 3.0); } __device__ void CheckNoArgs() { // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} printf("hello, world!"); } + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +__device__ bool foo(); +__device__ void CheckAllocaIsInEntryBlock() { + // CHECK: alloca %printf_args + // CHECK: call {{.*}} @_Z3foov() + if (foo()) { + printf("%d", 42); + } +} -- 2.7.4