From: Edward Yang Date: Wed, 5 Dec 2018 18:57:00 +0000 (-0800) Subject: Improve HIPify performance (#14803) X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~2451 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=999690ff3d1e04ff3d23944a230f699eb88e01db;p=platform%2Fupstream%2Fpytorch.git Improve HIPify performance (#14803) Summary: ``` Improve performance of pyHIPIFY Changes: - Pre-compile regexes, don't use regexes when it's not necessary (this saves us ~15%) - Compile all substitutions for mappings into a single, non-backtracking regex using a Trie. This gives big savings. Before, running pyHIPIFY on all files took 15.8s. Now it takes 3.9s. ``` Stacked on #14769 Pull Request resolved: https://github.com/pytorch/pytorch/pull/14803 Differential Revision: D13342620 Pulled By: ezyang fbshipit-source-id: 1cfa36b3236bbe24d07080a31cc788a52d740f40 --- diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 817d13b..c530c50 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -1905,6 +1905,8 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict([ ("cublasCgemm3mEx", ("rocblas_cgemm_3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), ("cublasZgemm_v2", ("rocblas_zgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), ("cublasZgemm3m", ("rocblas_zgemm_3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + # NB: The function rocblas_sgemmex doesn't actually exist in + # rocblas, as of 2018-12-05 ("cublasSgemmEx", ("rocblas_sgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), ("cublasGemmEx", ("rocblas_gemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), ("cublasCgemmEx", ("rocblas_cgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index 09c0566..9af02ce 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -321,10 +321,13 @@ def add_dim3(kernel_string, cuda_kernel): return cuda_kernel +RE_KERNEL_LAUNCH = re.compile(r'([ ]+)(detail?)::[ ]+\\\n[ ]+') + + def processKernelLaunches(string, stats): """ Replace the CUDA style Kernel launches with the HIP style kernel launches.""" # Concat the namespace with the kernel names. (Find cleaner way of doing this later). - string = re.sub(r'([ ]+)(detail?)::[ ]+\\\n[ ]+', lambda inp: "{0}{1}::".format(inp.group(1), inp.group(2)), string) + string = RE_KERNEL_LAUNCH.sub(lambda inp: "{0}{1}::".format(inp.group(1), inp.group(2)), string) def grab_method_and_template(in_kernel): # The positions for relevant kernel components. @@ -487,12 +490,15 @@ def find_parentheses_group(input_string, start): return find_closure_group(input_string, start, group=["(", ")"]) +RE_ASSERT = re.compile(r"\bassert[ ]*\(") + + def disable_asserts(input_string): """ Disables regular assert statements e.g. "assert(....)" -> "/*assert(....)*/" """ output_string = input_string - asserts = list(re.finditer(r"\bassert[ ]*\(", input_string)) + asserts = list(RE_ASSERT.finditer(input_string)) for assert_item in asserts: p_start, p_end = find_parentheses_group(input_string, assert_item.end() - 1) start = assert_item.start() @@ -506,9 +512,7 @@ def replace_forceinline(input_string): replacing '__forceinline__' with 'inline' as a workaround https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine """ - output_string = input_string - output_string = re.sub("__forceinline__", "inline", output_string) - return output_string + return input_string.replace("__forceinline__", "inline") def replace_math_functions(input_string): @@ -523,6 +527,9 @@ def replace_math_functions(input_string): return output_string +RE_SYNCTHREADS = re.compile(r"[:]?[:]?\b(__syncthreads)\b(\w*\()") + + def hip_header_magic(input_string): """If the file makes kernel builtin calls and does not include the cuda_runtime.h header, then automatically add an #include to match the "magic" includes provided by NVCC. @@ -542,7 +549,7 @@ def hip_header_magic(input_string): hasDeviceLogic = "hipLaunchKernelGGL" in output_string hasDeviceLogic += "__global__" in output_string hasDeviceLogic += "__shared__" in output_string - hasDeviceLogic += re.search(r"[:]?[:]?\b(__syncthreads)\b(\w*\()", output_string) is not None + hasDeviceLogic += RE_SYNCTHREADS.search(output_string) is not None # If device logic found, provide the necessary header. if hasDeviceLogic: @@ -551,6 +558,9 @@ def hip_header_magic(input_string): return output_string +RE_EXTERN_SHARED = re.compile(r"extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;") + + def replace_extern_shared(input_string): """Match extern __shared__ type foo[]; syntax and use HIP_DYNAMIC_SHARED() MACRO instead. https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md#__shared__ @@ -559,8 +569,7 @@ def replace_extern_shared(input_string): "extern __shared__ unsigned char smem[];" => "HIP_DYNAMIC_SHARED( unsigned char, my_smem)" """ output_string = input_string - output_string = re.sub( - r"extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;", + output_string = RE_EXTERN_SHARED.sub( lambda inp: "HIP_DYNAMIC_SHARED({0} {1}, {2})".format( inp.group(1) or "", inp.group(2), inp.group(3)), output_string) @@ -790,6 +799,84 @@ def is_caffe2_gpu_file(filepath): return ('gpu' in filename or ext in ['.cu', '.cuh']) and ('cudnn' not in filename) +# Cribbed from https://stackoverflow.com/questions/42742810/speed-up-millions-of-regex-replacements-in-python-3/42789508#42789508 +class Trie(): + """Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern. + The corresponding Regex should match much faster than a simple Regex union.""" + + def __init__(self): + self.data = {} + + def add(self, word): + ref = self.data + for char in word: + ref[char] = char in ref and ref[char] or {} + ref = ref[char] + ref[''] = 1 + + def dump(self): + return self.data + + def quote(self, char): + return re.escape(char) + + def _pattern(self, pData): + data = pData + if "" in data and len(data.keys()) == 1: + return None + + alt = [] + cc = [] + q = 0 + for char in sorted(data.keys()): + if isinstance(data[char], dict): + try: + recurse = self._pattern(data[char]) + alt.append(self.quote(char) + recurse) + except: + cc.append(self.quote(char)) + else: + q = 1 + cconly = not len(alt) > 0 + + if len(cc) > 0: + if len(cc) == 1: + alt.append(cc[0]) + else: + alt.append('[' + ''.join(cc) + ']') + + if len(alt) == 1: + result = alt[0] + else: + result = "(?:" + "|".join(alt) + ")" + + if q: + if cconly: + result += "?" + else: + result = "(?:%s)?" % result + return result + + def pattern(self): + return self._pattern(self.dump()) + + +CAFFE2_TRIE = Trie() +CAFFE2_MAP = {} +PYTORCH_TRIE = Trie() +PYTORCH_MAP = {} +for mapping in CUDA_TO_HIP_MAPPINGS: + for src, value in mapping.items(): + dst = value[0] + meta_data = value[1:] + if constants.API_CAFFE2 not in meta_data: + PYTORCH_TRIE.add(src) + PYTORCH_MAP[src] = dst + CAFFE2_TRIE.add(src) + CAFFE2_MAP[src] = dst +RE_CAFFE2_PREPROCESSOR = re.compile(CAFFE2_TRIE.pattern()) +RE_PYTORCH_PREPROCESSOR = re.compile(r'\b{0}\b'.format(PYTORCH_TRIE.pattern())) + def preprocessor(output_directory, filepath, stats): """ Executes the CUDA -> HIP conversion on the specified file. """ fin_path = os.path.join(output_directory, filepath) @@ -801,27 +888,15 @@ def preprocessor(output_directory, filepath, stats): os.makedirs(os.path.dirname(fout_path)) with open(fout_path, 'w') as fout: - # Perform type, method, constant replacements - for mapping in CUDA_TO_HIP_MAPPINGS: - for cuda_type, value in mapping.items(): - # Extract relevant information - hip_type = value[0] - meta_data = value[1:] - - if constants.API_CAFFE2 in meta_data and is_pytorch_file(filepath): - continue - - if output_source.find(cuda_type) > -1: - # Check if supported - if constants.HIP_UNSUPPORTED in meta_data: - stats["unsupported_calls"].append((cuda_type, filepath)) - - if cuda_type in output_source: - if is_pytorch_file(filepath): - pattern = r'(\b{0}\b)'.format(re.escape(cuda_type)) - else: - pattern = r'({0})'.format(re.escape(cuda_type)) - output_source = re.sub(pattern, hip_type, output_source) + # unsupported_calls statistics reporting is broken atm + if is_pytorch_file(filepath): + def pt_repl(m): + return PYTORCH_MAP[m.group(0)] + output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source) + else: + def c2_repl(m): + return CAFFE2_MAP[m.group(0)] + output_source = RE_CAFFE2_PREPROCESSOR.sub(c2_repl, output_source) # Perform Kernel Launch Replacements output_source = processKernelLaunches(output_source, stats) @@ -831,7 +906,7 @@ def preprocessor(output_directory, filepath, stats): # output_source = disable_asserts(output_source) # Replace std:: with non-std:: versions - if re.search(r"\.cu$", filepath) or re.search(r"\.cuh$", filepath): + if filepath.endswith(".cu") or filepath.endswith(".cuh"): output_source = replace_math_functions(output_source) # Replace __forceinline__ with inline @@ -894,6 +969,10 @@ def fix_static_global_kernels(in_txt): # get deleted soon. +RE_KERNEL_TEMPLATE = re.compile(r"(template[ ]*<(.*)>\n.*\n?)?__global__ void[\n| ](\w+(\(.*\))?)\(") +RE_GENERATE_KERNEL = re.compile(r"GENERATE_KERNEL([1-9])\((.*)\)") + + def get_kernel_template_params(output_directory, the_file, KernelTemplateParams, template_param_to_value): """Scan for __global__ kernel definitions then extract its argument types, and static cast as necessary""" # Read the kernel file. @@ -901,8 +980,7 @@ def get_kernel_template_params(output_directory, the_file, KernelTemplateParams, # Extract all kernels with their templates inside of the file string = f.read() - get_kernel_definitions = [k for k in re.finditer( - r"(template[ ]*<(.*)>\n.*\n?)?__global__ void[\n| ](\w+(\(.*\))?)\(", string)] + get_kernel_definitions = [k for k in RE_KERNEL_TEMPLATE.finditer(string)] # Create new launch syntax for kernel in get_kernel_definitions: @@ -967,7 +1045,7 @@ def get_kernel_template_params(output_directory, the_file, KernelTemplateParams, # Extract generated kernels # curandStateMtgp32 *state, int size, T *result, ARG1 - for kernel in re.finditer(r"GENERATE_KERNEL([1-9])\((.*)\)", string): + for kernel in RE_GENERATE_KERNEL.finditer(string): kernel_gen_type = int(kernel.group(1)) kernel_name = kernel.group(2).split(",")[0] kernel_params = kernel.group(2).split(",")[1:] @@ -1020,11 +1098,14 @@ def disable_unsupported_function_call(function, input_string, replacement): return output_string +RE_INCLUDE = re.compile(r"#include .*\n") + + def disable_module(input_file): """Disable a module entirely except for header includes.""" with openf(input_file, "r+") as f: txt = f.read() - last = list(re.finditer(r"#include .*\n", txt))[-1] + last = list(RE_INCLUDE.finditer(txt))[-1] end = last.end() disabled = "{0}#if !defined(__HIP_PLATFORM_HCC__)\n{1}\n#endif".format(txt[0:end], txt[end:]) @@ -1080,6 +1161,9 @@ def extract_arguments(start, string): return arguments +RE_HIP_LAUNCH_KERNEL_GGL = re.compile("hipLaunchKernelGGL\(") + + # Add static_cast to ensure that the type of kernel arguments matches that in the corresponding kernel definition def add_static_casts(orig_filepath, filepath, KernelTemplateParams): """Add static casts to kernel launches in order to keep launch argument types and kernel definition types matching. @@ -1101,7 +1185,7 @@ def add_static_casts(orig_filepath, filepath, KernelTemplateParams): with openf(filepath, "r+") as fileobj: input_source = fileobj.read() new_output_source = input_source - for kernel in re.finditer("hipLaunchKernelGGL\(", input_source): + for kernel in RE_HIP_LAUNCH_KERNEL_GGL.finditer(input_source): arguments = extract_arguments(kernel.end() - 1, input_source) # Check if we have templating + static_cast information