From f649599ea93301bd0d0a2b8e450d1f77425ea92e Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Fri, 21 Apr 2023 20:01:10 -0700 Subject: [PATCH] [CMake] Use LLVM own tools in extract_symbols.py As of now, 'extract_symbols.py' can use several tools to extract symbols from object files and libraries and to guess if the target is 32-bit Windows. The tools are being found via PATH, so in most cases, they are just system tools. This approach has a number of limitations, in particular: * System tools may not be able to handle the target format in case of cross-platform builds, * They cannot read symbols from LLVM bitcode files, so the staged LTO build with plugins is not supported, * The auto-selected tools may be suboptimal (see D113557), * Support for multiple tools for a single task increases the complexity of the script code. The patch proposes using LLVM's own tools to solve these issues. Specifically, 'llvm-readobj' detects the target platform, and 'llvm-nm' reads symbols from all supported formats, including bitcode files. The tools can be built in Release mode for the host platform or overridden using CMake settings 'LLVM_READOBJ' and 'LLVM_NM' respectively. The implementation also supports using precompiled tools via 'LLVM_NATIVE_TOOL_DIR'. Differential Revision: https://reviews.llvm.org/D149119 --- llvm/CMakeLists.txt | 5 - llvm/cmake/modules/AddLLVM.cmake | 31 ++++-- llvm/cmake/modules/CrossCompile.cmake | 16 +-- llvm/tools/llvm-nm/CMakeLists.txt | 2 + llvm/tools/llvm-readobj/CMakeLists.txt | 2 + llvm/tools/llvm-shlib/CMakeLists.txt | 15 +-- llvm/utils/extract_symbols.py | 177 ++++++++------------------------- 7 files changed, 83 insertions(+), 165 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 5e2f08f..dfe81ad0 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1129,11 +1129,6 @@ endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS ) # use export_executable_symbols(target). 
set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "") -set(LLVM_EXTRACT_SYMBOLS_FLAGS "" - CACHE STRING "Additional options to pass to llvm/utils/extract_symbols.py. - These cannot override the options set by cmake, but can add extra options - such as --tools.") - include(AddLLVM) include(TableGen) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 5357e54..91d2c8b 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1246,10 +1246,18 @@ function(export_executable_symbols target) else() set(mangling itanium) endif() + get_host_tool_path(llvm-nm LLVM_NM llvm_nm_exe llvm_nm_target) + get_host_tool_path(llvm-readobj LLVM_READOBJ llvm_readobj_exe llvm_readobj_target) add_custom_command(OUTPUT ${exported_symbol_file} - COMMAND "${Python3_EXECUTABLE}" ${LLVM_MAIN_SRC_DIR}/utils/extract_symbols.py ${LLVM_EXTRACT_SYMBOLS_FLAGS} --mangling=${mangling} ${static_libs} -o ${exported_symbol_file} + COMMAND "${Python3_EXECUTABLE}" + ${LLVM_MAIN_SRC_DIR}/utils/extract_symbols.py + --mangling=${mangling} ${static_libs} + -o ${exported_symbol_file} + --nm=${llvm_nm_exe} + --readobj=${llvm_readobj_exe} WORKING_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR} - DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/extract_symbols.py ${static_libs} + DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/extract_symbols.py + ${static_libs} ${llvm_nm_target} ${llvm_readobj_target} VERBATIM COMMENT "Generating export list for ${target}") add_llvm_symbol_exports( ${target} ${exported_symbol_file} ) @@ -2423,8 +2431,8 @@ function(find_first_existing_vc_file path out_var) endif() endfunction() -function(setup_host_tool tool_name setting_name exe_var_name target_var_name) - set(${setting_name}_DEFAULT "${tool_name}") +function(get_host_tool_path tool_name setting_name exe_var_name target_var_name) + set(${setting_name}_DEFAULT "") if(LLVM_NATIVE_TOOL_DIR) if(EXISTS "${LLVM_NATIVE_TOOL_DIR}/${tool_name}${LLVM_HOST_EXECUTABLE_SUFFIX}") @@ -2435,11 +2443,11 @@ 
function(setup_host_tool tool_name setting_name exe_var_name target_var_name) set(${setting_name} "${${setting_name}_DEFAULT}" CACHE STRING "Host ${tool_name} executable. Saves building if cross-compiling.") - if(NOT ${setting_name} STREQUAL "${tool_name}") + if(${setting_name}) set(exe_name ${${setting_name}}) - set(target_name ${${setting_name}}) + set(target_name "") elseif(LLVM_USE_HOST_TOOLS) - build_native_tool(${tool_name} exe_name DEPENDS ${tool_name}) + get_native_tool_path(${tool_name} exe_name) set(target_name ${exe_name}) else() set(exe_name $<TARGET_FILE:${tool_name}>) @@ -2448,3 +2456,12 @@ function(setup_host_tool tool_name setting_name exe_var_name target_var_name) set(${exe_var_name} "${exe_name}" CACHE STRING "") set(${target_var_name} "${target_name}" CACHE STRING "") endfunction() + +function(setup_host_tool tool_name setting_name exe_var_name target_var_name) + get_host_tool_path(${tool_name} ${setting_name} ${exe_var_name} ${target_var_name}) + # Set up a native tool build if necessary + if(LLVM_USE_HOST_TOOLS AND NOT ${setting_name}) + build_native_tool(${tool_name} exe_name DEPENDS ${tool_name}) + add_custom_target(${target_var_name} DEPENDS ${exe_name}) + endif() +endfunction() diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index 7c750f0..6af47b5 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -97,6 +97,15 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) endfunction() +function(get_native_tool_path target output_path_var) + if(CMAKE_CONFIGURATION_TYPES) + set(output_path "${${PROJECT_NAME}_NATIVE_BUILD}/Release/bin/${target}") + else() + set(output_path "${${PROJECT_NAME}_NATIVE_BUILD}/bin/${target}") + endif() + set(${output_path_var} ${output_path}${LLVM_HOST_EXECUTABLE_SUFFIX} PARENT_SCOPE) +endfunction() + # Sets up a native build for a tool, used e.g. for cross-compilation and # LLVM_OPTIMIZED_TABLEGEN. Always builds in Release. 
# - target: The target to build natively @@ -105,12 +114,7 @@ endfunction() function(build_native_tool target output_path_var) cmake_parse_arguments(ARG "" "" "DEPENDS" ${ARGN}) - if(CMAKE_CONFIGURATION_TYPES) - set(output_path "${${PROJECT_NAME}_NATIVE_BUILD}/Release/bin/${target}") - else() - set(output_path "${${PROJECT_NAME}_NATIVE_BUILD}/bin/${target}") - endif() - set(output_path ${output_path}${LLVM_HOST_EXECUTABLE_SUFFIX}) + get_native_tool_path(${target} output_path) # Make chain of preceding actions if(CMAKE_GENERATOR MATCHES "Visual Studio") diff --git a/llvm/tools/llvm-nm/CMakeLists.txt b/llvm/tools/llvm-nm/CMakeLists.txt index cd69712..ec04f1e 100644 --- a/llvm/tools/llvm-nm/CMakeLists.txt +++ b/llvm/tools/llvm-nm/CMakeLists.txt @@ -25,6 +25,8 @@ add_llvm_tool(llvm-nm GENERATE_DRIVER ) +setup_host_tool(llvm-nm LLVM_NM llvm_nm_exe llvm_nm_target) + if(LLVM_INSTALL_BINUTILS_SYMLINKS) add_llvm_tool_symlink(nm llvm-nm) endif() diff --git a/llvm/tools/llvm-readobj/CMakeLists.txt b/llvm/tools/llvm-readobj/CMakeLists.txt index c49526b..0051f87 100644 --- a/llvm/tools/llvm-readobj/CMakeLists.txt +++ b/llvm/tools/llvm-readobj/CMakeLists.txt @@ -30,6 +30,8 @@ add_llvm_tool(llvm-readobj GENERATE_DRIVER ) +setup_host_tool(llvm-readobj LLVM_READOBJ llvm_readobj_exe llvm_readobj_target) + add_llvm_tool_symlink(llvm-readelf llvm-readobj) if(LLVM_INSTALL_BINUTILS_SYMLINKS) diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt index 90e2904..4f6a2cb 100644 --- a/llvm/tools/llvm-shlib/CMakeLists.txt +++ b/llvm/tools/llvm-shlib/CMakeLists.txt @@ -166,21 +166,10 @@ if(LLVM_BUILD_LLVM_C_DYLIB AND MSVC) set(GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/gen-msvc-exports.py) set(LLVM_EXPORTED_SYMBOL_FILE ${LLVM_BINARY_DIR}/${CMAKE_CFG_INTDIR}/libllvm-c.exports) - if(NOT LLVM_NM) - if(CMAKE_CROSSCOMPILING) - build_native_tool(llvm-nm llvm_nm) - set(llvm_nm_target "${llvm_nm}") - else() - set(llvm_nm $<TARGET_FILE:llvm-nm>) - set(llvm_nm_target llvm-nm) - endif() - 
else() - set(llvm_nm ${LLVM_NM}) - set(llvm_nm_target "") - endif() + get_host_tool_path(llvm-nm LLVM_NM llvm_nm_exe llvm_nm_target) add_custom_command(OUTPUT ${LLVM_EXPORTED_SYMBOL_FILE} - COMMAND "${Python3_EXECUTABLE}" ${GEN_SCRIPT} --libsfile ${LIBSFILE} ${GEN_UNDERSCORE} --nm "${llvm_nm}" -o ${LLVM_EXPORTED_SYMBOL_FILE} + COMMAND "${Python3_EXECUTABLE}" ${GEN_SCRIPT} --libsfile ${LIBSFILE} ${GEN_UNDERSCORE} --nm "${llvm_nm_exe}" -o ${LLVM_EXPORTED_SYMBOL_FILE} DEPENDS ${LIB_NAMES} ${llvm_nm_target} COMMENT "Generating export list for LLVM-C" VERBATIM ) diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py index 7ec27b3..a2eabd3 100755 --- a/llvm/utils/extract_symbols.py +++ b/llvm/utils/extract_symbols.py @@ -23,30 +23,20 @@ import subprocess import multiprocessing import argparse -# Define functions which extract a list of pairs of (symbols, is_def) from a -# library using several different tools. We use subprocess.Popen and yield a -# symbol at a time instead of using subprocess.check_output and returning a list -# as, especially on Windows, waiting for the entire output to be ready can take -# a significant amount of time. - -def dumpbin_get_symbols(lib): - process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1, - stdout=subprocess.PIPE, stdin=subprocess.PIPE, - universal_newlines=True) - process.stdin.close() - for line in process.stdout: - # Look for external symbols - match = re.match("^.+(SECT|UNDEF).+External\s+\|\s+(\S+).*$", line) - if match: - yield (match.group(2), match.group(1) != "UNDEF") - process.wait() - -def nm_get_symbols(lib): - # -P means the output is in portable format, and -g means we only get global - # symbols. - cmd = ['nm','-P','-g'] - if sys.platform.startswith('aix'): - cmd += ['-Xany','-C','-p'] +# Define a function which extracts a list of pairs of (symbols, is_def) from a +# library using llvm-nm because it can work both with regular and bitcode files. 
+# We use subprocess.Popen and yield a symbol at a time instead of using +# subprocess.check_output and returning a list as, especially on Windows, waiting +# for the entire output to be ready can take a significant amount of time. +def nm_get_symbols(tool, lib): + # '-P' means the output is in portable format, + # '-g' means we only get global symbols, + # '-Xany' enforces handling both 32- and 64-bit objects on AIX, + # '--no-demangle' ensures that C++ symbol names are not demangled; note + # that llvm-nm does not demangle by default, but the system nm on AIX does + # that, so the behavior may change in the future, + # '-p' means we do not waste time sorting the symbols. + cmd = [tool,'-P','-g','-Xany','--no-demangle','-p'] process = subprocess.Popen(cmd+[lib], bufsize=1, stdout=subprocess.PIPE, stdin=subprocess.PIPE, universal_newlines=True) @@ -68,61 +58,10 @@ def nm_get_symbols(lib): yield (match.group(1), False) process.wait() -def readobj_get_symbols(lib): - process = subprocess.Popen(['llvm-readobj','--symbols',lib], bufsize=1, - stdout=subprocess.PIPE, stdin=subprocess.PIPE, - universal_newlines=True) - process.stdin.close() - for line in process.stdout: - # When looking through the output of llvm-readobj we expect to see Name, - # Section, then StorageClass, so record Name and Section when we see - # them and decide if this is an external symbol when we see - # StorageClass. 
- match = re.search('Name: (\S+)', line) - if match: - name = match.group(1) - match = re.search('Section: (\S+)', line) - if match: - section = match.group(1) - match = re.search('StorageClass: (\S+)', line) - if match: - storageclass = match.group(1) - if section != 'IMAGE_SYM_ABSOLUTE' and \ - storageclass == 'External': - yield (name, section != 'IMAGE_SYM_UNDEFINED') - process.wait() - -# Define functions which determine if the target is 32-bit Windows (as that's +# Define a function which determines if the target is 32-bit Windows (as that's # where calling convention name decoration happens). - -def dumpbin_is_32bit_windows(lib): - # dumpbin /headers can output a huge amount of data (>100MB in a debug - # build) so we read only up to the 'machine' line then close the output. - process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1, - stdout=subprocess.PIPE, stdin=subprocess.PIPE, - universal_newlines=True) - process.stdin.close() - retval = False - for line in process.stdout: - match = re.match('.+machine \((\S+)\)', line) - if match: - retval = (match.group(1) == 'x86') - break - process.stdout.close() - process.wait() - return retval - -def objdump_is_32bit_windows(lib): - output = subprocess.check_output(['objdump','-f',lib], - universal_newlines=True) - for line in output.splitlines(): - match = re.match('.+file format (\S+)', line) - if match: - return (match.group(1) == 'pe-i386') - return False - -def readobj_is_32bit_windows(lib): - output = subprocess.check_output(['llvm-readobj','--file-header',lib], +def readobj_is_32bit_windows(tool, lib): + output = subprocess.check_output([tool,'--file-header',lib], universal_newlines=True) for line in output.splitlines(): match = re.match('Format: (\S+)', line) @@ -130,11 +69,6 @@ def readobj_is_32bit_windows(lib): return (match.group(1) == 'COFF-i386') return False -# On AIX, there isn't an easy way to detect 32-bit windows objects with the system toolchain, -# so just assume false. 
-def aix_is_32bit_windows(lib): - return False - # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the # identifier/type mangling we can decide which symbols could possibly be # required and which we can discard. @@ -355,10 +289,10 @@ def parse_microsoft_mangling(arg): return components def extract_symbols(arg): - get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg + llvm_nm_path, should_keep_symbol, calling_convention_decoration, lib = arg symbol_defs = dict() symbol_refs = set() - for (symbol, is_def) in get_symbols(lib): + for (symbol, is_def) in nm_get_symbols(llvm_nm_path, lib): symbol = should_keep_symbol(symbol, calling_convention_decoration) if symbol: if is_def: @@ -392,63 +326,38 @@ def get_template_name(sym, mangling): # Not a template return None +def parse_tool_path(parser, tool, val): + try: + # Close std streams as we don't want any output and we don't + # want the process to wait for something on stdin. + p = subprocess.Popen([val], stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=subprocess.PIPE, + universal_newlines=True) + p.stdout.close() + p.stderr.close() + p.stdin.close() + p.wait() + return val + except Exception: + parser.error(f'Invalid path for {tool}') + if __name__ == '__main__': - tool_exes = ['dumpbin','nm','objdump','llvm-readobj'] parser = argparse.ArgumentParser( description='Extract symbols to export from libraries') parser.add_argument('--mangling', choices=['itanium','microsoft'], required=True, help='expected symbol mangling scheme') - parser.add_argument('--tools', choices=tool_exes, nargs='*', - help='tools to use to extract symbols and determine the' - ' target') + parser.add_argument('--nm', metavar='path', + type=lambda x: parse_tool_path(parser, 'nm', x), + help='path to the llvm-nm executable') + parser.add_argument('--readobj', metavar='path', + type=lambda x: parse_tool_path(parser, 'readobj', x), + help='path to the llvm-readobj executable') parser.add_argument('libs', metavar='lib', type=str, 
nargs='+', help='libraries to extract symbols from') parser.add_argument('-o', metavar='file', type=str, help='output to file') args = parser.parse_args() - # Determine the function to use to get the list of symbols from the inputs, - # and the function to use to determine if the target is 32-bit windows. - tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows), - 'nm' : (nm_get_symbols, None), - 'objdump' : (None, objdump_is_32bit_windows), - 'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) } - get_symbols = None - is_32bit_windows = aix_is_32bit_windows if sys.platform.startswith('aix') else None - # If we have a tools argument then use that for the list of tools to check - if args.tools: - tool_exes = args.tools - # Find a tool to use by trying each in turn until we find one that exists - # (subprocess.call will throw OSError when the program does not exist) - get_symbols = None - for exe in tool_exes: - try: - # Close std streams as we don't want any output and we don't - # want the process to wait for something on stdin. 
- p = subprocess.Popen([exe], stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=subprocess.PIPE, - universal_newlines=True) - p.stdout.close() - p.stderr.close() - p.stdin.close() - p.wait() - # Keep going until we have a tool to use for both get_symbols and - # is_32bit_windows - if not get_symbols: - get_symbols = tools[exe][0] - if not is_32bit_windows: - is_32bit_windows = tools[exe][1] - if get_symbols and is_32bit_windows: - break - except OSError: - continue - if not get_symbols: - print("Couldn't find a program to read symbols with", file=sys.stderr) - exit(1) - if not is_32bit_windows: - print("Couldn't find a program to determining the target", file=sys.stderr) - exit(1) - # How we determine which symbols to keep and which to discard depends on # the mangling scheme if args.mangling == 'microsoft': @@ -478,7 +387,7 @@ if __name__ == '__main__': # Check if calling convention decoration is used by inspecting the first # library in the list - calling_convention_decoration = is_32bit_windows(libs[0]) + calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0]) # Extract symbols from libraries in parallel. This is a huge time saver when # doing a debug build, as there are hundreds of thousands of symbols in each @@ -489,7 +398,7 @@ if __name__ == '__main__': # use a lambda or local function definition as that doesn't work on # windows, so create a list of tuples which duplicates the arguments # that are the same in all calls. - vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs] + vals = [(args.nm, should_keep_symbol, calling_convention_decoration, x) for x in libs] # Do an async map then wait for the result to make sure that # KeyboardInterrupt gets caught correctly (see # http://bugs.python.org/issue8296) -- 2.7.4