From 22c898dbfd3a79c90a78dfc2af28928eeb167ecf Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 22 Dec 2022 18:18:40 -0800 Subject: [PATCH] [OpenMP] Use Attributor to find underlying objects of stores When we see a store in generic mode, we need to decide whether it has to be guarded for SPMDization. This patch replaces the getUnderlyingObjects call with the more optimistic Attributor-based underlying-objects query (forallUnderlyingObjects on the underlying-objects abstract attribute, combined with AA::isAssumedThreadLocalObject) to identify more thread-local pointers. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 24 ++++++++++------------ .../OpenMP/reduced_pointer_info_assertion.ll | 1 + .../test/Transforms/OpenMP/spmdization_guarding.ll | 22 ++++++++++++++++++++ 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 5cf29bb..44871e9 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -4070,23 +4070,21 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!I.mayWriteToMemory()) return true; if (auto *SI = dyn_cast(&I)) { - SmallVector Objects; - getUnderlyingObjects(SI->getPointerOperand(), Objects); - if (llvm::all_of(Objects, - [](const Value *Obj) { return isa(Obj); })) - return true; - // Check for AAHeapToStack moved objects which must not be guarded. + const auto &UnderlyingObjsAA = A.getAAFor( + *this, IRPosition::value(*SI->getPointerOperand()), + DepClassTy::OPTIONAL); auto &HS = A.getAAFor( *this, IRPosition::function(*I.getFunction()), DepClassTy::OPTIONAL); - if (llvm::all_of(Objects, [&HS](const Value *Obj) { - auto *CB = dyn_cast(Obj); - if (!CB) - return false; - return HS.isAssumedHeapToStack(*CB); - })) { + if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) { + if (AA::isAssumedThreadLocalObject(A, Obj, *this)) + return true; + // Check for AAHeapToStack moved objects which must not be + // guarded. 
+ auto *CB = dyn_cast(&Obj); + return CB && HS.isAssumedHeapToStack(*CB); + })) return true; - } } // Insert instruction that needs guarding. diff --git a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll index 0f8c0be..822b701 100644 --- a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll +++ b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll @@ -12,6 +12,7 @@ define internal i32 @nblist() { ret i32 0 } + define fastcc void @rec(ptr %0, i64 %1) { ; CHECK-LABEL: define {{[^@]+}}@rec( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0:%.*]], i64 [[TMP1:%.*]] diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 8d8b333..e99ad4a 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -40,6 +40,7 @@ target triple = "nvptx64" @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @0 }, align 8 @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode = weak constant i8 1 @llvm.compiler.used = appending global [1 x ptr] [ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" +@LocGlob = private unnamed_addr addrspace(5) global i32 43 ; Function Attrs: convergent norecurse nounwind ;. 
@@ -47,12 +48,14 @@ target triple = "nvptx64" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 ; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" +; CHECK: @[[LOCGLOB:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(5) global i32 43 ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. ; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 ; CHECK-DISABLED: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" +; CHECK-DISABLED: @[[LOCGLOB:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(5) global i32 43 ; CHECK-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. 
define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %x, i64 %N) #0 { @@ -60,11 +63,16 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %x, i64 %N) ; CHECK-SAME: (ptr [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HEAP2STACK_H2S:%.*]] = alloca i8, i64 8, align 8 +; CHECK-NEXT: [[LOC:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[AL32:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: +; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) +; CHECK-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 ; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 @@ -179,6 +187,8 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %x, i64 %N) ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[HEAP2STACK_H2S:%.*]] = alloca i8, i64 8, align 8 ; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-DISABLED-NEXT: [[LOC:%.*]] = alloca ptr, align 8 +; CHECK-DISABLED-NEXT: [[AL32:%.*]] = alloca i32, align 4 ; CHECK-DISABLED-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 ; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1]], i8 1, i1 false) #[[ATTR6:[0-9]+]] ; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 @@ -217,6 +227,9 @@ define weak void 
@__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %x, i64 %N) ; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK-DISABLED: user_code.entry: +; CHECK-DISABLED-NEXT: [[C:%.*]] = icmp eq i64 [[N]], 42 +; CHECK-DISABLED-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) +; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8 ; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] ; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias !8 ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 @@ -262,12 +275,17 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %x, i64 %N) ; CHECK-DISABLED-NEXT: ret void ; entry: + %loc = alloca ptr + %al32 = alloca i32 %N.addr.sroa.0.0.extract.trunc = trunc i64 %N to i32 %0 = call i32 @__kmpc_target_init(ptr nonnull @1, i8 1, i1 true) #3 %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry + %c = icmp eq i64 %N, 42 + %select = select i1 %c, ptr %al32, ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) + store ptr %select, ptr %loc %1 = call i32 @__kmpc_global_thread_num(ptr nonnull @1) store i32 0, ptr %x, align 4, !noalias !8 %arrayidx1.i = getelementptr inbounds i32, ptr %x, i64 1 @@ -292,6 +310,9 @@ for.body.i: ; preds = %for.cond.i %idxprom4.i = zext i32 %i.0.i to i64 %arrayidx5.i = getelementptr inbounds i32, ptr %x, i64 %idxprom4.i store i32 %sub3.i, ptr %arrayidx5.i, align 4, !noalias !8 + ; No Need to guard these accesses + %l = load ptr, ptr %loc + store i32 4711, ptr %l %inc.i = add nuw nsw i32 %i.0.i, 1 br label %for.cond.i, !llvm.loop !11 @@ -359,6 +380,7 @@ define weak i32 @__kmpc_target_init(ptr, i8, i1) { ; Function Attrs: 
convergent declare i32 @no_openmp(ptr) #1 +declare void @no_openmp_i32(i32) #1 ; Function Attrs: convergent nounwind readonly willreturn declare void @pure() #2 -- 2.7.4