From aaabbbf8863827cb9e286e1e04cc41f088f9383f Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Thu, 15 Sep 2016 14:05:58 +0000 Subject: [PATCH] GPGPU: Do not assume arrays start at 0 Our alias checks precisely check that the minimal and maximal accessed elements do not overlap in a kernel. Hence, we must ensure that our host <-> device transfers do not touch additional memory locations that are not covered in the alias check. To ensure this, we make sure that the data we copy for a given array is only the data from the smallest element accessed to the largest element accessed. We also adjust the size of the array according to the offset at which the array is actually accessed. An interesting result of this is: In case array are accessed with negative subscripts ,e.g., A[-100], we automatically allocate and transfer _more_ data to cover the full array. This is important as such code indeed exists in the wild. llvm-svn: 281611 --- polly/lib/CodeGen/PPCGCodeGeneration.cpp | 86 +++++++++++++++++++++ polly/test/GPGPU/non-zero-array-offset.ll | 124 ++++++++++++++++++++++++++++++ polly/test/GPGPU/phi-nodes-in-kernel.ll | 12 +-- 3 files changed, 216 insertions(+), 6 deletions(-) create mode 100644 polly/test/GPGPU/non-zero-array-offset.ll diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 076a94a..48444d6 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -284,6 +284,21 @@ private: /// @param Array The array for which to compute a size. Value *getArraySize(gpu_array_info *Array); + /// Generate code to compute the minimal offset at which an array is accessed. + /// + /// The offset of an array is the minimal array location accessed in a scop. + /// + /// Example: + /// + /// for (long i = 0; i < 100; i++) + /// A[i + 42] += ... + /// + /// getArrayOffset(A) results in 42. + /// + /// @param Array The array for which to compute the offset. + /// @returns An llvm::Value that contains the offset of the array. + Value *getArrayOffset(gpu_array_info *Array); + /// Prepare the kernel arguments for kernel code generation /// /// @param Kernel The kernel to generate code for. @@ -468,6 +483,12 @@ void GPUNodeBuilder::allocateDeviceArrays() { DevArrayName.append(Array->name); Value *ArraySize = getArraySize(Array); + Value *Offset = getArrayOffset(Array); + if (Offset) + ArraySize = Builder.CreateSub( + ArraySize, + Builder.CreateMul(Offset, + Builder.getInt64(ScopArray->getElemSizeInBytes()))); Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); DevArray->setName(DevArrayName); DeviceAllocations[ScopArray] = DevArray; @@ -721,6 +742,48 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { return ArraySize; } +Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { + if (gpu_array_is_scalar(Array)) + return nullptr; + + isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); + + isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent)); + + isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min)); + + for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) + ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0); + + if (isl_set_is_subset(Min, ZeroSet)) { + isl_set_free(Min); + isl_set_free(ZeroSet); + isl_ast_build_free(Build); + return nullptr; + } + isl_set_free(ZeroSet); + + isl_ast_expr *Result = + isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0)); + + for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) { + if (i > 0) { + isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]); + isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); + Result = isl_ast_expr_mul(Result, BExpr); + } + isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i); + isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin); + Result = isl_ast_expr_add(Result, MExpr); + } + + Value *ResultValue = ExprBuilder.create(Result); + isl_set_free(Min); + isl_ast_build_free(Build); + + return ResultValue; +} + void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, enum DataDirection Direction) { isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); @@ -730,6 +793,7 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, auto ScopArray = (ScopArrayInfo *)(Array->user); Value *Size = getArraySize(Array); + Value *Offset = getArrayOffset(Array); Value *DevPtr = DeviceAllocations[ScopArray]; Value *HostPtr; @@ -739,8 +803,20 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, else HostPtr = ScopArray->getBasePtr(); + if (Offset) { + HostPtr = Builder.CreatePointerCast( + HostPtr, ScopArray->getElementType()->getPointerTo()); + HostPtr = Builder.CreateGEP(HostPtr, Offset); + } + HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); + if (Offset) { + Size = Builder.CreateSub( + Size, Builder.CreateMul( + Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); + } + if (Direction == HOST_TO_DEVICE) createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); else @@ -1000,6 +1076,16 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, Value *DevArray = DeviceAllocations[const_cast(SAI)]; DevArray = createCallGetDevicePtr(DevArray); + + Value *Offset = getArrayOffset(&Prog->array[i]); + + if (Offset) { + DevArray = Builder.CreatePointerCast( + DevArray, SAI->getElementType()->getPointerTo()); + DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); + DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); + } + Instruction *Param = new AllocaInst( Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), EntryBlock->getTerminator()); diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll new file mode 100644 index 0000000..b3c1d91 --- /dev/null +++ b/polly/test/GPGPU/non-zero-array-offset.ll @@ -0,0 +1,124 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck -check-prefix=CODE %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ +; RUN: FileCheck %s -check-prefix=IR +; +; REQUIRES: pollyacc + +; CODE: Code +; CODE-NEXT: ==== +; CODE-NEXT: # host +; CODE-NEXT: { +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock(8); +; CODE-NEXT: dim3 k0_dimGrid(1); +; CODE-NEXT: kernel0 <<>> (dev_MemRef_B); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: { +; CODE-NEXT: dim3 k1_dimBlock(8); +; CODE-NEXT: dim3 k1_dimGrid(1); +; CODE-NEXT: kernel1 <<>> (dev_MemRef_A); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); +; CODE-NEXT: } + +; CODE: # kernel0 +; CODE-NEXT: Stmt_bb3(t0); + +; CODE: # kernel1 +; CODE-NEXT: Stmt_bb11(t0); + +; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32) +; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32) +; IR-NEXT: [[REG0:%.+]] = getelementptr float, float* %B, i64 8 +; IR-NEXT: [[REG1:%.+]] = bitcast float* [[REG0]] to i8* +; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32) + +; IR: [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) +; IR-NEXT: [[REGB:%.+]] = bitcast i8* [[REGA]] to float* +; IR-NEXT: [[REGC:%.+]] = getelementptr float, float* [[REGB]], i64 -8 +; IR-NEXT: [[REGD:%.+]] = bitcast float* [[REGC]] to i8* + +; void foo(float A[], float B[]) { +; for (long i = 0; i < 8; i++) +; B[i + 8] *= 4; +; +; for (long i = 0; i < 8; i++) +; A[i] *= 12; +; } +; +; #ifdef OUTPUT +; int main() { +; float A[16]; +; +; for (long i = 0; i < 16; i++) { +; __sync_synchronize(); +; A[i] = i; +; } +; +; foo(A, A); +; +; float sum = 0; +; for (long i = 0; i < 16; i++) { +; __sync_synchronize(); +; sum += A[i]; +; } +; +; printf("%f\n", sum); +; } +; #endif +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @foo(float* %A, float* %B) { +bb: + br label %bb2 + +bb2: ; preds = %bb7, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] + %exitcond1 = icmp ne i64 %i.0, 8 + br i1 %exitcond1, label %bb3, label %bb9 + +bb3: ; preds = %bb2 + %tmp = add nuw nsw i64 %i.0, 8 + %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp + %tmp5 = load float, float* %tmp4, align 4 + %tmp6 = fmul float %tmp5, 4.000000e+00 + store float %tmp6, float* %tmp4, align 4 + br label %bb7 + +bb7: ; preds = %bb3 + %tmp8 = add nuw nsw i64 %i.0, 1 + br label %bb2 + +bb9: ; preds = %bb2 + br label %bb10 + +bb10: ; preds = %bb15, %bb9 + %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] + %exitcond = icmp ne i64 %i1.0, 8 + br i1 %exitcond, label %bb11, label %bb17 + +bb11: ; preds = %bb10 + %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0 + %tmp13 = load float, float* %tmp12, align 4 + %tmp14 = fmul float %tmp13, 1.200000e+01 + store float %tmp14, float* %tmp12, align 4 + br label %bb15 + +bb15: ; preds = %bb11 + %tmp16 = add nuw nsw i64 %i1.0, 1 + br label %bb10 + +bb17: ; preds = %bb10 + ret void +} diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll index 5befa36..f367096 100644 --- a/polly/test/GPGPU/phi-nodes-in-kernel.ll +++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll @@ -38,13 +38,13 @@ target triple = "x86_64-unknown-linux-gnu" ; CODE-NEXT: Stmt_for_cond15_for_cond12_loopexit_crit_edge(0); ; CODE-NEXT: } -; IR: %1 = bitcast i32* %out_l.055.phiops to i8* -; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4) +; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8* +; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4) -; IR: %14 = bitcast i32* %out_l.055.phiops to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4) -; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4) +; IR: [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8* +; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4) +; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8* +; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4) ; KERNEL-IR: entry: ; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32 -- 2.7.4