From b2a8d2c69bce98604f1feaad9e6bebd3759b5635 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Tue, 10 Jan 2023 13:51:28 -0800
Subject: [PATCH] [OpenMP] Avoid running openmp-opt on dead functions

The Attributor has logic to run only on assumed-live functions, and this
is now exposed to users. OpenMP-opt will (mostly) ignore dead internal
functions but run the same deduction as before if an internal function
is marked live. This should lower compile time, as we run on less code
and delete more code early on. For the full OpenMC module compiled with
noinline and JITed at runtime, we save ~25%, or ~10s, on my machine
during JITing.
---
 llvm/include/llvm/Transforms/IPO/Attributor.h      | 11 ++--
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp              | 59 +++++++++++++++-------
 .../openmp_opt_dont_follow_gep_without_value.ll    | 14 -----
 llvm/test/Transforms/OpenMP/global_constructor.ll  | 18 +++----
 .../OpenMP/reduced_pointer_info_assertion.ll       | 13 -----
 .../Transforms/OpenMP/single_threaded_execution.ll | 14 ++---
 6 files changed, 64 insertions(+), 65 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index dbf2892..272d710 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1406,12 +1406,13 @@ struct AttributorConfig {
   bool RewriteSignatures = true;
 
   /// Flag to determine if we want to initialize all default AAs for an internal
-  /// function marked live.
-  /// TODO: This should probably be a callback, or maybe
-  /// identifyDefaultAbstractAttributes should be virtual, something to allow
-  /// customizable lazy initialization for internal functions.
+  /// function marked live. See also: InitializationCallback
   bool DefaultInitializeLiveInternals = true;
 
+  /// Callback function to be invoked on internal functions marked live.
+  std::function<void(Attributor &A, const Function &F)> InitializationCallback =
+      nullptr;
+
   /// Helper to update an underlying call graph and to delete functions.
   CallGraphUpdater &CGUpdater;
 
@@ -1738,6 +1739,8 @@ struct Attributor {
 
     if (Configuration.DefaultInitializeLiveInternals)
       identifyDefaultAbstractAttributes(const_cast<Function &>(F));
+    if (Configuration.InitializationCallback)
+      Configuration.InitializationCallback(*this, F);
   }
 
   /// Helper function to remove callsite.
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index eebe8d3..8bd61a2 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2163,8 +2163,13 @@ private:
   void registerFoldRuntimeCall(RuntimeFunction RF);
 
   /// Populate the Attributor with abstract attribute opportunities in the
-  /// function.
+  /// functions.
   void registerAAs(bool IsModulePass);
+
+public:
+  /// Callback to register AAs for live functions, including internal functions
+  /// marked live during the traversal.
+  static void registerAAsForFunction(Attributor &A, const Function &F);
 };
 
 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
@@ -4849,20 +4854,35 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
     if (F->isDeclaration())
       continue;
 
-    if (!DisableOpenMPOptDeglobalization)
-      A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(*F));
-    A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
-    if (!DisableOpenMPOptDeglobalization)
-      A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
-
-    for (auto &I : instructions(*F)) {
-      if (auto *LI = dyn_cast<LoadInst>(&I)) {
-        bool UsedAssumedInformation = false;
-        A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
-                               UsedAssumedInformation, AA::Interprocedural);
-      } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
-        A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
-      }
+    // We look at internal functions only on-demand but if any use is not a
+    // direct call or outside the current set of analyzed functions, we have
+    // to do it eagerly.
+    if (F->hasLocalLinkage()) {
+      if (llvm::all_of(F->uses(), [this](const Use &U) {
+            const auto *CB = dyn_cast<CallBase>(U.getUser());
+            return CB && CB->isCallee(&U) &&
+                   A.isRunOn(const_cast<Function *>(CB->getCaller()));
+          }))
+        continue;
+    }
+    registerAAsForFunction(A, *F);
+  }
+}
+
+void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
+  if (!DisableOpenMPOptDeglobalization)
+    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
+  A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+  if (!DisableOpenMPOptDeglobalization)
+    A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
+
+  for (auto &I : instructions(F)) {
+    if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      bool UsedAssumedInformation = false;
+      A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
+                             UsedAssumedInformation, AA::Interprocedural);
+    } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
     }
   }
 }
@@ -5033,10 +5053,13 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   }
 
   // Look at every function in the Module unless it was internalized.
+  SetVector<Function *> Functions;
   SmallVector<Function *, 16> SCC;
   for (Function &F : M)
-    if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
+    if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
       SCC.push_back(&F);
+      Functions.insert(&F);
+    }
 
   if (SCC.empty())
     return PreservedAnalyses::all();
@@ -5057,12 +5080,13 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
 
   AttributorConfig AC(CGUpdater);
   AC.DefaultInitializeLiveInternals = false;
+  AC.IsModulePass = true;
   AC.RewriteSignatures = false;
   AC.MaxFixpointIterations = MaxFixpointIterations;
   AC.OREGetter = OREGetter;
   AC.PassName = DEBUG_TYPE;
+  AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
 
-  SetVector<Function *> Functions;
   Attributor A(Functions, InfoCache, AC);
 
   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
@@ -5137,6 +5161,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
   AC.MaxFixpointIterations = MaxFixpointIterations;
   AC.OREGetter = OREGetter;
   AC.PassName = DEBUG_TYPE;
+  AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
 
   Attributor A(Functions, InfoCache, AC);
diff --git a/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll b/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
index 5005a09..57cbcb9 100644
--- a/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
+++ b/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
@@ -29,19 +29,6 @@ define weak_odr ptr @h(ptr %0) {
 ; CHECK-NEXT: ret void
 ;
 ;
-; CHECK: Function Attrs: norecurse nounwind memory(none)
-; CHECK-LABEL: define {{[^@]+}}@g
-; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: ret double 0.000000e+00
-;
-;
-; CHECK: Function Attrs: norecurse nosync nounwind memory(none)
-; CHECK-LABEL: define {{[^@]+}}@h.internalized
-; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP0]], i64 undef
-; CHECK-NEXT: ret ptr [[TMP2]]
-;
-;
 ; CHECK-LABEL: define {{[^@]+}}@h
 ; CHECK-SAME: (ptr [[TMP0:%.*]]) {
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP0]], align 4
@@ -50,7 +37,6 @@ define weak_odr ptr @h(ptr %0) {
 ;
 ;.
 ; CHECK: attributes #[[ATTR0]] = { norecurse nounwind memory(none) }
-; CHECK: attributes #[[ATTR1]] = { norecurse nosync nounwind memory(none) }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll
index 1c304ce..a0b608b 100644
--- a/llvm/test/Transforms/OpenMP/global_constructor.ll
+++ b/llvm/test/Transforms/OpenMP/global_constructor.ll
@@ -37,7 +37,7 @@ declare i32 @__kmpc_target_init(ptr, i8, i1) local_unnamed_addr
 
 declare void @__kmpc_target_deinit(ptr, i8) local_unnamed_addr
 
-define internal void @__omp_offloading__fd02_85283c04_Device_l6_ctor() {
+define weak void @__omp_offloading__fd02_85283c04_Device_l6_ctor() {
 entry:
   %call.i = tail call double @__nv_log(double noundef 2.000000e+00) #1
   %call.i2 = tail call double @__nv_log(double noundef 2.000000e+00) #1
@@ -78,31 +78,29 @@ attributes #1 = { convergent nounwind }
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_85283c04_main_l11
 ; CHECK-SAME: (ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1:[0-9]+]], i8 2, i1 false) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(ptr nonnull @[[GLOB1:[0-9]+]], i8 2, i1 false) #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK: common.ret:
 ; CHECK-NEXT: ret void
 ; CHECK: user_code.entry:
 ; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA11:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR2]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
 ; CHECK: region.guarded:
 ; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA11]]
 ; CHECK-NEXT: br label [[REGION_BARRIER]]
 ; CHECK: region.barrier:
-; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR2]]
-; CHECK-NEXT: tail call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR2]]
+; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR1]]
+; CHECK-NEXT: tail call void @__kmpc_target_deinit(ptr nonnull @[[GLOB1]], i8 2) #[[ATTR1]]
 ; CHECK-NEXT: br label [[COMMON_RET]]
 ;
 ;
-; CHECK: Function Attrs: norecurse
-; CHECK-LABEL: define {{[^@]+}}@__omp_offloading__fd02_85283c04_Device_l6_ctor
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading__fd02_85283c04_Device_l6_ctor() {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL_I:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT: [[CALL_I2:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR1]]
+; CHECK-NEXT: [[CALL_I:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: [[CALL_I2:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR0]]
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[CALL_I]], [[CALL_I2]]
 ; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, align 8, !tbaa [[TBAA11]]
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
index 822b701..855bab8 100644
--- a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
+++ b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
@@ -29,19 +29,6 @@ define fastcc void @rec(ptr %0, i64 %1) {
 !0 = !{i32 7, !"openmp", i32 50}
 !1 = !{i32 7, !"openmp-device", i32 50}
 
-; MODULE-LABEL: define {{[^@]+}}@nblist
-; MODULE-SAME: () #[[ATTR0:[0-9]+]] {
-; MODULE-NEXT: [[TMP1:%.*]] = call ptr @alloc()
-; MODULE-NEXT: call fastcc void @rec.internalized(ptr [[TMP1]], i64 0)
-; MODULE-NEXT: ret i32 0
-;
-;
-; MODULE-LABEL: define {{[^@]+}}@rec.internalized
-; MODULE-SAME: (ptr nocapture writeonly [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
-; MODULE-NEXT: call fastcc void @rec.internalized(ptr nocapture writeonly [[TMP0]], i64 0) #[[ATTR2:[0-9]+]]
-; MODULE-NEXT: ret void
-;
-;
 ; MODULE-LABEL: define {{[^@]+}}@rec
 ; MODULE-SAME: (ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]) {
 ; MODULE-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[TMP1]]
diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
index 16ccd2d..fb52321 100644
--- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
+++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
@@ -51,6 +51,13 @@ if.end:
   ret void
 }
 
+; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
+; Function Attrs: noinline
+define internal void @foo() {
+entry:
+  ret void
+}
+
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
 ; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
@@ -72,13 +79,6 @@ if.end:
   ret void
 }
 
-; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
-; Function Attrs: noinline
-define internal void @foo() {
-entry:
-  ret void
-}
-
 ; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
 ; Function Attrs: noinline
 define void @bar() {
-- 
2.7.4
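
For readers unfamiliar with the Attributor setup, a minimal usage sketch of the
new AttributorConfig::InitializationCallback hook follows. It is not part of the
patch: the wrapper runAttributorLazily, the seedLiveFunction name, and the choice
of AAMemoryBehavior are made up for illustration; only the config fields and the
Attributor/InformationCache/CallGraphUpdater types come from the code above.

  // Seed AAs only for functions the Attributor marks live, mirroring how
  // OpenMP-opt now wires OpenMPOpt::registerAAsForFunction into its config.
  static void seedLiveFunction(Attributor &A, const Function &F) {
    // Hypothetical choice of AA; a real pass would seed whatever it queries.
    A.getOrCreateAAFor<AAMemoryBehavior>(IRPosition::function(F));
  }

  static void runAttributorLazily(SetVector<Function *> &Functions,
                                  InformationCache &InfoCache,
                                  CallGraphUpdater &CGUpdater) {
    AttributorConfig AC(CGUpdater);
    AC.DefaultInitializeLiveInternals = false;    // no eager seeding of internals
    AC.InitializationCallback = seedLiveFunction; // invoked when marked live
    Attributor A(Functions, InfoCache, AC);
    // Externally visible functions are assumed live; seed them up front.
    for (Function *F : Functions)
      if (!F->hasLocalLinkage())
        seedLiveFunction(A, *F);
    A.run();
  }

The point of the callback over DefaultInitializeLiveInternals is that a pass only
pays for the AAs it actually wants, and only for internal functions that turn out
to be reachable, which is where the compile-time savings in this patch come from.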