/// @param Array The array for which to compute a size.
Value *getArraySize(gpu_array_info *Array);
+ /// Generate code to compute the minimal offset at which an array is accessed.
+ ///
+ /// The offset of an array is the minimal array location accessed in a scop.
+ ///
+ /// Example:
+ ///
+ /// for (long i = 0; i < 100; i++)
+ /// A[i + 42] += ...
+ ///
+ /// getArrayOffset(A) results in 42.
+ ///
+ /// @param Array The array for which to compute the offset.
+ /// @returns An llvm::Value that contains the offset of the array.
+ Value *getArrayOffset(gpu_array_info *Array);
+
/// Prepare the kernel arguments for kernel code generation
///
/// @param Kernel The kernel to generate code for.
DevArrayName.append(Array->name);
Value *ArraySize = getArraySize(Array);
+ Value *Offset = getArrayOffset(Array);
+ if (Offset)
+ ArraySize = Builder.CreateSub(
+ ArraySize,
+ Builder.CreateMul(Offset,
+ Builder.getInt64(ScopArray->getElemSizeInBytes())));
Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
DevArray->setName(DevArrayName);
DeviceAllocations[ScopArray] = DevArray;
return ArraySize;
}
+Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
+ if (gpu_array_is_scalar(Array))
+ return nullptr;
+
+ isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
+
+ isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
+
+ isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
+
+ for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
+ ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
+
+ if (isl_set_is_subset(Min, ZeroSet)) {
+ isl_set_free(Min);
+ isl_set_free(ZeroSet);
+ isl_ast_build_free(Build);
+ return nullptr;
+ }
+ isl_set_free(ZeroSet);
+
+ isl_ast_expr *Result =
+ isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
+
+ for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
+ if (i > 0) {
+ isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
+ isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
+ Result = isl_ast_expr_mul(Result, BExpr);
+ }
+ isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
+ isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
+ Result = isl_ast_expr_add(Result, MExpr);
+ }
+
+ Value *ResultValue = ExprBuilder.create(Result);
+ isl_set_free(Min);
+ isl_ast_build_free(Build);
+
+ return ResultValue;
+}
+
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
enum DataDirection Direction) {
isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
auto ScopArray = (ScopArrayInfo *)(Array->user);
Value *Size = getArraySize(Array);
+ Value *Offset = getArrayOffset(Array);
Value *DevPtr = DeviceAllocations[ScopArray];
Value *HostPtr;
else
HostPtr = ScopArray->getBasePtr();
+ if (Offset) {
+ HostPtr = Builder.CreatePointerCast(
+ HostPtr, ScopArray->getElementType()->getPointerTo());
+ HostPtr = Builder.CreateGEP(HostPtr, Offset);
+ }
+
HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+ if (Offset) {
+ Size = Builder.CreateSub(
+ Size, Builder.CreateMul(
+ Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
+ }
+
if (Direction == HOST_TO_DEVICE)
createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
else
Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
DevArray = createCallGetDevicePtr(DevArray);
+
+ Value *Offset = getArrayOffset(&Prog->array[i]);
+
+ if (Offset) {
+ DevArray = Builder.CreatePointerCast(
+ DevArray, SAI->getElementType()->getPointerTo());
+ DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
+ DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
+ }
+
Instruction *Param = new AllocaInst(
Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());
--- /dev/null
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+;
+; REQUIRES: pollyacc
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT: {
+; CODE-NEXT: dim3 k0_dimBlock(8);
+; CODE-NEXT: dim3 k0_dimGrid(1);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }
+
+; CODE: {
+; CODE-NEXT: dim3 k1_dimBlock(8);
+; CODE-NEXT: dim3 k1_dimGrid(1);
+; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }
+
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: Stmt_bb3(t0);
+
+; CODE: # kernel1
+; CODE-NEXT: Stmt_bb11(t0);
+
+; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT: [[REG0:%.+]] = getelementptr float, float* %B, i64 8
+; IR-NEXT: [[REG1:%.+]] = bitcast float* [[REG0]] to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32)
+
+; IR: [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR-NEXT: [[REGB:%.+]] = bitcast i8* [[REGA]] to float*
+; IR-NEXT: [[REGC:%.+]] = getelementptr float, float* [[REGB]], i64 -8
+; IR-NEXT: [[REGD:%.+]] = bitcast float* [[REGC]] to i8*
+
+; void foo(float A[], float B[]) {
+; for (long i = 0; i < 8; i++)
+; B[i + 8] *= 4;
+;
+; for (long i = 0; i < 8; i++)
+; A[i] *= 12;
+; }
+;
+; #ifdef OUTPUT
+; int main() {
+; float A[16];
+;
+; for (long i = 0; i < 16; i++) {
+; __sync_synchronize();
+; A[i] = i;
+; }
+;
+; foo(A, A);
+;
+; float sum = 0;
+; for (long i = 0; i < 16; i++) {
+; __sync_synchronize();
+; sum += A[i];
+; }
+;
+; printf("%f\n", sum);
+; }
+; #endif
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B) {
+bb:
+ br label %bb2
+
+bb2: ; preds = %bb7, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
+ %exitcond1 = icmp ne i64 %i.0, 8
+ br i1 %exitcond1, label %bb3, label %bb9
+
+bb3: ; preds = %bb2
+ %tmp = add nuw nsw i64 %i.0, 8
+ %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp
+ %tmp5 = load float, float* %tmp4, align 4
+ %tmp6 = fmul float %tmp5, 4.000000e+00
+ store float %tmp6, float* %tmp4, align 4
+ br label %bb7
+
+bb7: ; preds = %bb3
+ %tmp8 = add nuw nsw i64 %i.0, 1
+ br label %bb2
+
+bb9: ; preds = %bb2
+ br label %bb10
+
+bb10: ; preds = %bb15, %bb9
+ %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ]
+ %exitcond = icmp ne i64 %i1.0, 8
+ br i1 %exitcond, label %bb11, label %bb17
+
+bb11: ; preds = %bb10
+ %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
+ %tmp13 = load float, float* %tmp12, align 4
+ %tmp14 = fmul float %tmp13, 1.200000e+01
+ store float %tmp14, float* %tmp12, align 4
+ br label %bb15
+
+bb15: ; preds = %bb11
+ %tmp16 = add nuw nsw i64 %i1.0, 1
+ br label %bb10
+
+bb17: ; preds = %bb10
+ ret void
+}
; CODE-NEXT: Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
; CODE-NEXT: }
-; IR: %1 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+; IR: [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
-; IR: %14 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4)
-; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4)
+; IR: [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
+; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
; KERNEL-IR: entry:
; KERNEL-IR-NEXT: %out_l.055.s2a = alloca i32