From c3054aeb5a3ba7778b1296722cfb90b494819b60 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 1 Jan 2023 14:27:15 -0500 Subject: [PATCH] OpenMPOpt: Fix using wrong address space for alloca Using the function's address space makes no sense. Copied from the existing test, with more addrspace variation. Could just replace the existing one with this version if it's redundant. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 12 ++- .../Transforms/OpenMP/hide_mem_transfer_latency.ll | 4 +- .../OpenMP/values_in_offload_arrays.alloca.ll | 95 ++++++++++++++++++++++ 3 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/OpenMP/values_in_offload_arrays.alloca.ll diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 84469ea..90f0196 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1765,10 +1765,14 @@ private: // function. Used for storing information of the async transfer, allowing to // wait on it later. auto &IRBuilder = OMPInfoCache.OMPBuilder; - auto *F = RuntimeCall.getCaller(); - Instruction *FirstInst = &(F->getEntryBlock().front()); - AllocaInst *Handle = new AllocaInst( - IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); + Function *F = RuntimeCall.getCaller(); + BasicBlock &Entry = F->getEntryBlock(); + IRBuilder.Builder.SetInsertPoint(&Entry, + Entry.getFirstNonPHIOrDbgOrAlloca()); + Value *Handle = IRBuilder.Builder.CreateAlloca( + IRBuilder.AsyncInfo, /*ArraySize=*/nullptr, "handle"); + Handle = + IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr); // Add "issue" runtime call declaration: // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll index e75a7aa..1a342cc 100644 --- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -43,12 +43,12 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 define dso_local double @heavyComputation1() { ; CHECK-LABEL: define {{[^@]+}}@heavyComputation1() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 ; CHECK-NEXT: [[A:%.*]] = alloca double, align 8 ; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [1 x i8*], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [1 x i8*], align 8 +; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() ; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 777 @@ -461,10 +461,10 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { ; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1 ; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 +; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() ; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 diff --git a/llvm/test/Transforms/OpenMP/values_in_offload_arrays.alloca.ll b/llvm/test/Transforms/OpenMP/values_in_offload_arrays.alloca.ll new file mode 100644 index 0000000..a16138be --- /dev/null +++ b/llvm/test/Transforms/OpenMP/values_in_offload_arrays.alloca.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=openmp-opt-cgscc -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" + +@.__omp_offloading_heavyComputation.region_id = weak constant i8 0 +@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35] + +%struct.ident_t = type { i32, i32, i32, i32, ptr } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8 + +;int heavyComputation(ptr a, unsigned size) { +; int random = rand() % 7; +; +; //#pragma omp target data map(a[0:size], size) +; ptr args[2]; +; args[0] = &a; +; args[1] = &size; +; __tgt_target_data_begin(..., args, ...) +; +; #pragma omp target teams +; for (int i = 0; i < size; ++i) { +; a[i] = ++aptr 3.141624; +; } +; +; return random; +;} +define dso_local i32 @heavyComputation(ptr %a, i32 %size) { +; CHECK-LABEL: @heavyComputation( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x ptr], align 8, addrspace(5) +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x ptr], align 8, addrspace(5) +; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8, addrspace(5) +; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8, addrspace(5) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[HANDLE]] to ptr +; CHECK-NEXT: store i32 [[SIZE:%.*]], ptr addrspace(5) [[SIZE_ADDR]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() +; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i64 [[CONV]], 3 +; CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[DOTOFFLOAD_BASEPTRS]], align 8 +; CHECK-NEXT: store ptr [[A]], ptr addrspace(5) [[DOTOFFLOAD_PTRS]], align 8 +; CHECK-NEXT: store i64 [[SHL]], ptr addrspace(5) [[DOTOFFLOAD_SIZES]], align 8 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds [2 x ptr], ptr addrspace(5) [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 +; CHECK-NEXT: store ptr addrspace(5) [[SIZE_ADDR]], ptr addrspace(5) [[GEP0]], align 8 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x ptr], ptr addrspace(5) [[DOTOFFLOAD_PTRS]], i64 0, i64 1 +; CHECK-NEXT: store ptr addrspace(5) [[SIZE_ADDR]], ptr addrspace(5) [[GEP1]], align 8 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i64], ptr addrspace(5) [[DOTOFFLOAD_SIZES]], i64 0, i64 1 +; CHECK-NEXT: store i64 4, ptr addrspace(5) [[GEP2]], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS_FLAT:%.*]] = addrspacecast ptr addrspace(5) [[DOTOFFLOAD_BASEPTRS]] to ptr +; CHECK-NEXT: [[DOTOFFLOAD_PTRS_FLAT:%.*]] = addrspacecast ptr addrspace(5) [[DOTOFFLOAD_PTRS]] to ptr +; CHECK-NEXT: [[DOTOFFLOAD_SIZES_FLAT:%.*]] = addrspacecast ptr addrspace(5) [[DOTOFFLOAD_SIZES]] to ptr +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(ptr @[[GLOB0:[0-9]+]], i64 -1, i32 2, ptr [[DOTOFFLOAD_BASEPTRS_FLAT]], ptr [[DOTOFFLOAD_PTRS_FLAT]], ptr [[DOTOFFLOAD_SIZES_FLAT]], ptr @.offload_maptypes., ptr null, ptr null, ptr [[TMP0]]) +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, ptr [[TMP0]]) +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @[[GLOB0]], i64 -1, i32 2, ptr nonnull [[DOTOFFLOAD_BASEPTRS_FLAT]], ptr nonnull [[DOTOFFLOAD_PTRS_FLAT]], ptr nonnull [[DOTOFFLOAD_SIZES_FLAT]], ptr @.offload_maptypes., ptr null, ptr null) +; CHECK-NEXT: ret i32 [[REM]] +; +entry: + %size.addr = alloca i32, align 4, addrspace(5) + %.offload_baseptrs = alloca [2 x ptr], align 8, addrspace(5) + %.offload_ptrs = alloca [2 x ptr], align 8, addrspace(5) + %.offload_sizes = alloca [2 x i64], align 8, addrspace(5) + store i32 %size, ptr addrspace(5) %size.addr, align 4 + %call = tail call i32 (...) @rand() + %conv = zext i32 %size to i64 + %shl = shl nuw nsw i64 %conv, 3 + store ptr %a, ptr addrspace(5) %.offload_baseptrs, align 8 + store ptr %a, ptr addrspace(5) %.offload_ptrs, align 8 + store i64 %shl, ptr addrspace(5) %.offload_sizes, align 8 + %gep0 = getelementptr inbounds [2 x ptr], ptr addrspace(5) %.offload_baseptrs, i64 0, i64 1 + store ptr addrspace(5) %size.addr, ptr addrspace(5) %gep0, align 8 + %gep1 = getelementptr inbounds [2 x ptr], ptr addrspace(5) %.offload_ptrs, i64 0, i64 1 + store ptr addrspace(5) %size.addr, ptr addrspace(5) %gep1, align 8 + %gep2 = getelementptr inbounds [2 x i64], ptr addrspace(5) %.offload_sizes, i64 0, i64 1 + store i64 4, ptr addrspace(5) %gep2, align 8 + %.offload_baseptrs.flat = addrspacecast ptr addrspace(5) %.offload_baseptrs to ptr + %.offload_ptrs.flat = addrspacecast ptr addrspace(5) %.offload_ptrs to ptr + %.offload_sizes.flat = addrspacecast ptr addrspace(5) %.offload_sizes to ptr + call void @__tgt_target_data_begin_mapper(ptr @0, i64 -1, i32 2, ptr nonnull %.offload_baseptrs.flat, ptr nonnull %.offload_ptrs.flat, ptr nonnull %.offload_sizes.flat, ptr @.offload_maptypes., ptr null, ptr null) + %rem = srem i32 %call, 7 + call void @__tgt_target_data_end_mapper(ptr @0, i64 -1, i32 2, ptr nonnull %.offload_baseptrs.flat, ptr nonnull %.offload_ptrs.flat, ptr nonnull %.offload_sizes.flat, ptr @.offload_maptypes., ptr null, ptr null) + ret i32 %rem +} + +declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) +declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) + +declare dso_local i32 @rand(...) + +!llvm.module.flags = !{!0} + +!0 = !{i32 7, !"openmp", i32 50} -- 2.7.4