From a90be207c60cf3bcbcdf452f4443ba0b73d868ae Mon Sep 17 00:00:00 2001 From: Siddharth Bhat Date: Tue, 9 May 2017 10:45:52 +0000 Subject: [PATCH] [Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen Summary: PPCGCodeGeneration now attaches the size of the kernel launch parameters at the end of the parameter list. For the existing CUDA Runtime, this gets ignored, but the OpenCL Runtime knows to check for kernel-argument size at the end of the parameter list. (The resulting parameters list is twice as long. This has been accounted for in the corresponding test cases). Reviewers: grosser, Meinersbur, bollu Reviewed By: bollu Subscribers: nemanjai, yaxunl, Anastasia, pollydev, llvm-commits Tags: #polly Differential Revision: https://reviews.llvm.org/D32961 llvm-svn: 302515 --- polly/lib/CodeGen/PPCGCodeGeneration.cpp | 71 ++++++++++++++++------ polly/test/GPGPU/cuda-managed-memory-simple.ll | 18 ++++-- polly/test/GPGPU/host-control-flow.ll | 2 +- polly/test/GPGPU/kernel-params-only-some-arrays.ll | 4 +- polly/test/GPGPU/parametric-loop-bound.ll | 2 +- polly/tools/GPURuntime/GPUJIT.c | 26 ++------ 6 files changed, 76 insertions(+), 47 deletions(-) diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 45e570c..4b09faa 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -142,6 +142,14 @@ static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( return RefToExpr; } +/// Given a LLVM Type, compute its size in bytes, +static int computeSizeInBytes(const Type *T) { + int bytes = T->getPrimitiveSizeInBits() / 8; + if (bytes == 0) + bytes = T->getScalarSizeInBits() / 8; + return bytes; +} + /// Generate code for a GPU specific isl AST. /// /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which @@ -272,6 +280,16 @@ private: /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. std::tuple getBlockSizes(ppcg_kernel *Kernel); + /// Store a specific kernel launch parameter in the array of kernel launch + /// parameters. + /// + /// @param Parameters The list of parameters in which to store. + /// @param Param The kernel launch parameter to store. + /// @param Index The index in the parameter list, at which to store the + /// parameter. + void insertStoreParameter(Instruction *Parameters, Instruction *Param, + int Index); + /// Create kernel launch parameters. /// /// @param Kernel The kernel to create parameters for. @@ -1192,11 +1210,21 @@ GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); } +void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters, + Instruction *Param, int Index) { + Value *Slot = Builder.CreateGEP( + Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); + Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); +} + Value * GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, SetVector SubtreeValues) { - Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), - std::distance(F->arg_begin(), F->arg_end())); + const int NumArgs = F->arg_size(); + std::vector ArgSizes(NumArgs); + + Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); BasicBlock *EntryBlock = &Builder.GetInsertBlock()->getParent()->getEntryBlock(); @@ -1213,6 +1241,8 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + ArgSizes[Index] = SAI->getElemSizeInBytes(); + Value *DevArray = nullptr; if (ManagedMemory) { DevArray = getOrCreateManagedDeviceArray( @@ -1265,16 +1295,15 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); Value *Val = IDToValue[Id]; isl_id_free(Id); + + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), EntryBlock->getTerminator()); Builder.CreateStore(Val, Param); - Value *Slot = Builder.CreateGEP( - Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); + insertStoreParameter(Parameters, Param, Index); Index++; } @@ -1284,30 +1313,38 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); Value *Val = IDToValue[Id]; isl_id_free(Id); + + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), EntryBlock->getTerminator()); Builder.CreateStore(Val, Param); - Value *Slot = Builder.CreateGEP( - Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); + insertStoreParameter(Parameters, Param, Index); Index++; } for (auto Val : SubtreeValues) { + ArgSizes[Index] = computeSizeInBytes(Val->getType()); + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), EntryBlock->getTerminator()); Builder.CreateStore(Val, Param); - Value *Slot = Builder.CreateGEP( - Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); - Value *ParamTyped = - Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); - Builder.CreateStore(ParamTyped, Slot); + insertStoreParameter(Parameters, Param, Index); + Index++; + } + + for (int i = 0; i < NumArgs; i++) { + Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); + Instruction *Param = + new AllocaInst(Builder.getInt32Ty(), AddressSpace, + Launch + "_param_size_" + std::to_string(i), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + insertStoreParameter(Parameters, Param, Index); Index++; } diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll index 4a97ec5..a8a1d6a 100644 --- a/polly/test/GPGPU/cuda-managed-memory-simple.ll +++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll @@ -37,18 +37,26 @@ ; CHECK: %13 = call i8* @polly_initContextCUDA() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* -; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; CHECK-NEXT: %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 ; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8* ; CHECK-NEXT: store i8* %16, i8** %15 ; CHECK-NEXT: %17 = bitcast i32* %R to i8* -; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %20) +; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0 +; CHECK-NEXT: %20 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 2 +; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8* +; CHECK-NEXT: store i8* %21, i8** %20 +; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1 +; CHECK-NEXT: %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3 +; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8* +; CHECK-NEXT: store i8* %23, i8** %22 +; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; CHECK-NEXT: call void @polly_freeKernel(i8* %24) ; CHECK-NEXT: call void @polly_synchronizeDevice() ; CHECK-NEXT: call void @polly_freeContext(i8* %13) diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll index 9e940aa..bc66dc0 100644 --- a/polly/test/GPGPU/host-control-flow.ll +++ b/polly/test/GPGPU/host-control-flow.ll @@ -32,7 +32,7 @@ ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] ; ... ; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] ; IR: call i8* @polly_getKernel diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll index 193de95..b6f3172 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -48,13 +48,13 @@ ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll index 687658e..1ca5151 100644 --- a/polly/test/GPGPU/parametric-loop-bound.ll +++ b/polly/test/GPGPU/parametric-loop-bound.ll @@ -31,7 +31,7 @@ ; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); ; IR: store i64 %n, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] diff --git a/polly/tools/GPURuntime/GPUJIT.c b/polly/tools/GPURuntime/GPUJIT.c index 5a00775..02dba03 100644 --- a/polly/tools/GPURuntime/GPUJIT.c +++ b/polly/tools/GPURuntime/GPUJIT.c @@ -554,28 +554,12 @@ static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX, sizeof(cl_uint), &NumArgs, NULL); checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - // TODO: Pass the size of the kernel arguments in to launchKernelCL, along - // with the arguments themselves. This is a dirty workaround that can be - // broken. + /* Argument sizes are stored at the end of the Parameters array. */ for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = - clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - } - } - if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { - fprintf(stderr, "Failed to set Kernel argument.\n"); - printOpenCLError(Ret); - exit(-1); - } + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, + *((int *)Parameters[NumArgs + i]), + (void *)Parameters[i]); + checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); } unsigned int GridDimZ = 1; -- 2.7.4