From 3dab7fede2019c399d793c43ca9ea5a4f2d5031f Mon Sep 17 00:00:00 2001
From: Amir Ayupov
Date: Fri, 23 Sep 2022 10:08:58 +0200
Subject: [PATCH] [CMake] Add clang-bolt target

This patch adds a `CLANG_BOLT_INSTRUMENT` option that applies BOLT
instrumentation to Clang, performs a bootstrap build with the resulting Clang,
merges the resulting fdata files into a single profile file, and uses it to
perform BOLT optimization on the original Clang binary.

The projects and targets used for bootstrap/profile collection are configurable
via `CLANG_BOLT_INSTRUMENT_PROJECTS` and `CLANG_BOLT_INSTRUMENT_TARGETS`.
The defaults are "llvm" and "count" respectively, which results in a profile
with ~5.3B dynamically executed instructions.

The intended use of the functionality is through the BOLT CMake cache file,
similar to the PGO 2-stage build:
```
cmake <llvm-project>/llvm -C <llvm-project>/clang/cmake/caches/BOLT.cmake
ninja clang++-bolt # pulls clang-bolt
```

Stats with a recent checkout (clang-16), pre-built BOLT and Clang, 72vCPU/224G
| CMake configure with host Clang + BOLT.cmake | 1m6.592s
| Instrumenting Clang with BOLT | 2m50.508s
| CMake configure `llvm` with instrumented Clang | 5m46.364s (~5x slowdown)
| CMake build `not` with instrumented Clang | 0m6.456s
| Merging fdata files | 0m9.439s
| Optimizing Clang with BOLT | 0m39.201s

Building Clang:
```
cmake ../llvm-project/llvm -DCMAKE_C_COMPILER=... -DCMAKE_CXX_COMPILER=... -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang -DLLVM_TARGETS_TO_BUILD=Native -GNinja
```
| | Release | BOLT-optimized
| cmake | 0m24.016s | 0m22.333s
| ninja clang | 5m55.692s | 4m35.122s

I know it's not rigorous, but it shows a ballpark figure.

Reviewed By: phosek

Differential Revision: https://reviews.llvm.org/D132975
---
 clang/CMakeLists.txt                     | 114 ++++++++++++++++++++++++++++++-
 clang/cmake/caches/BOLT.cmake            |  15 ++++
 clang/utils/perf-training/perf-helper.py |  18 ++++-
 3 files changed, 143 insertions(+), 4 deletions(-)
 create mode 100644 clang/cmake/caches/BOLT.cmake

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 02ce5de..22b5118 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -443,7 +443,7 @@ CMAKE_DEPENDENT_OPTION(CLANG_PLUGIN_SUPPORT
   "HAVE_CLANG_PLUGIN_SUPPORT" OFF)
 
 # If libstdc++ is statically linked, clang-repl needs to statically link libstdc++
-# itself, which is not possible in many platforms because of current limitations in 
+# itself, which is not possible in many platforms because of current limitations in
 # JIT stack. (more platforms need to be supported by JITLink)
 if(NOT LLVM_STATIC_LINK_CXX_STDLIB)
   set(HAVE_CLANG_REPL_SUPPORT ON)
@@ -881,6 +881,118 @@ if (CLANG_ENABLE_BOOTSTRAP)
   endforeach()
 endif()
 
+if (CLANG_BOLT_INSTRUMENT)
+  set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
+  set(CLANGXX_PATH ${CLANG_PATH}++)
+  set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
+  set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
+  set(CLANG_OPTIMIZED ${CLANG_PATH}-bolt)
+  set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt)
+
+  # Instrument clang with BOLT
+  add_custom_target(clang-instrumented
+    DEPENDS ${CLANG_INSTRUMENTED}
+  )
+  add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
+    DEPENDS clang llvm-bolt
+    COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
+      -instrument --instrumentation-file-append-pid
+      --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+    COMMENT "Instrumenting clang binary with BOLT"
+    VERBATIM
+  )
+
+  # Make a symlink from clang-bolt.inst to clang++-bolt.inst
+  add_custom_target(clang++-instrumented
+    DEPENDS ${CLANGXX_INSTRUMENTED}
+  )
+  add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED}
+    DEPENDS clang-instrumented
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+      ${CLANG_INSTRUMENTED}
+      ${CLANGXX_INSTRUMENTED}
+    COMMENT "Creating symlink from BOLT instrumented clang to clang++"
+    VERBATIM
+  )
+
+  # Build specified targets with instrumented Clang to collect the profile
+  set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/)
+  set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/)
+  set(build_configuration "$<CONFIG>")
+  include(ExternalProject)
+  ExternalProject_Add(bolt-instrumentation-profile
+    DEPENDS clang++-instrumented
+    PREFIX bolt-instrumentation-profile
+    SOURCE_DIR ${CMAKE_SOURCE_DIR}
+    STAMP_DIR ${STAMP_DIR}
+    BINARY_DIR ${BINARY_DIR}
+    EXCLUDE_FROM_ALL 1
+    CMAKE_ARGS
+                ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS}
+                # We shouldn't need to set this here, but INSTALL_DIR doesn't
+                # seem to work, so instead I'm passing this through
+                -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+                -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED}
+                -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED}
+                -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED}
+                -DCMAKE_ASM_COMPILER_ID=Clang
+                -DCMAKE_BUILD_TYPE=Release
+                -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_INSTRUMENT_PROJECTS}
+                -DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}
+    BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR}
+                                   --config ${build_configuration}
+                                   --target ${CLANG_BOLT_INSTRUMENT_TARGETS}
+    INSTALL_COMMAND ""
+    STEP_TARGETS configure build
+    USES_TERMINAL_CONFIGURE 1
+    USES_TERMINAL_BUILD 1
+    USES_TERMINAL_INSTALL 1
+  )
+
+  # Merge profiles into one using merge-fdata
+  add_custom_target(clang-bolt-profile
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+  )
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+    DEPENDS merge-fdata bolt-instrumentation-profile-build
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMAND ${Python3_EXECUTABLE}
+      ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
+      $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Preparing BOLT profile"
+    VERBATIM
+  )
+
+  # Optimize original (pre-bolt) Clang using the collected profile
+  add_custom_target(clang-bolt
+    DEPENDS ${CLANG_OPTIMIZED}
+  )
+  add_custom_command(OUTPUT ${CLANG_OPTIMIZED}
+    DEPENDS clang-bolt-profile
+    COMMAND llvm-bolt ${CLANG_PATH}
+      -o ${CLANG_OPTIMIZED}
+      -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
+      -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
+    COMMENT "Optimizing Clang with BOLT"
+    VERBATIM
+  )
+
+  # Make a symlink from clang-bolt to clang++-bolt
+  add_custom_target(clang++-bolt
+    DEPENDS ${CLANGXX_OPTIMIZED}
+  )
+  add_custom_command(OUTPUT ${CLANGXX_OPTIMIZED}
+    DEPENDS clang-bolt
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+      ${CLANG_OPTIMIZED}
+      ${CLANGXX_OPTIMIZED}
+    COMMENT "Creating symlink from BOLT optimized clang to clang++"
+    VERBATIM
+  )
+endif()
+
 if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
   add_subdirectory(utils/ClangVisualizers)
 endif()
diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
new file mode 100644
index 0000000..65444c8
--- /dev/null
+++ b/clang/cmake/caches/BOLT.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_BUILD_TYPE Release CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
+set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "")
+set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
+
+set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
+set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
+
+# Disable function splitting enabled by default in GCC8+
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition")
+endif()
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 88708a9..c6a815e 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -38,7 +38,7 @@ def clean(args):
 
 def merge(args):
   if len(args) != 3:
-    print('Usage: %s clean <llvm-profdata> <output> <path>\n' % __file__ +
+    print('Usage: %s merge <llvm-profdata> <output> <path>\n' % __file__ +
       '\tMerges all profraw files from path into output.')
     return 1
   cmd = [args[0], 'merge', '-o', args[1]]
@@ -46,6 +46,16 @@ def merge(args):
   subprocess.check_call(cmd)
   return 0
 
+def merge_fdata(args):
+  if len(args) != 3:
+    print('Usage: %s merge-fdata <merge-fdata> <output> <path>\n' % __file__ +
+      '\tMerges all fdata files from path into output.')
+    return 1
+  cmd = [args[0], '-o', args[1]]
+  cmd.extend(findFilesWithExtension(args[2], "fdata"))
+  subprocess.check_call(cmd)
+  return 0
+
 def dtrace(args):
   parser = argparse.ArgumentParser(prog='perf-helper dtrace',
     description='dtrace wrapper for order file generation')
@@ -395,10 +405,12 @@ def genOrderFile(args):
   return 0
 
 commands = {'clean' : clean,
-  'merge' : merge, 
+  'merge' : merge,
   'dtrace' : dtrace,
   'cc1' : cc1,
-  'gen-order-file' : genOrderFile}
+  'gen-order-file' : genOrderFile,
+  'merge-fdata' : merge_fdata,
+  }
 
 def main():
   f = commands[sys.argv[1]]
-- 
2.7.4