Move existing target determinator to tools (#63809)
author: driazati <driazati@users.noreply.github.com>
Wed, 25 Aug 2021 19:58:24 +0000 (12:58 -0700)
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 25 Aug 2021 20:03:53 +0000 (13:03 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63809

This moves the modulefinder-based target determinator out of `test/run_test.py` and into `tools/testing`, since it is only meant to be used in CI. This also simplifies `run_test.py` a little.

Test Plan: Imported from OSS

Reviewed By: malfet, seemethere, janeyx99

Differential Revision: D30497438

Pulled By: driazati

fbshipit-source-id: 1d203037af5af6a20c1e7812da935e7cbb5cd82f

test/run_test.py
test/test_determination.py
tools/testing/modulefinder_determinator.py [new file with mode: 0644]

index ecc93fe..d3c6610 100755 (executable)
@@ -4,8 +4,8 @@ import argparse
 import copy
 from datetime import datetime
 from distutils.util import strtobool
-import modulefinder
 import os
+import pathlib
 import shutil
 import signal
 import subprocess
@@ -24,9 +24,11 @@ from torch.testing._internal.common_utils import (
 import torch.distributed as dist
 from typing import Dict, Optional, List
 
+REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
+
 try:
     # using tools/ to optimize test run.
-    sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
+    sys.path.append(str(REPO_ROOT))
     from tools.testing.test_selections import (
         export_S3_test_times,
         get_shard_based_on_S3,
@@ -35,6 +37,10 @@ try:
         get_reordered_tests,
         get_test_case_configs,
     )
+    from tools.testing.modulefinder_determinator import (
+        should_run_test,
+        TARGET_DET_LIST,
+    )
 
     HAVE_TEST_SELECTION_TOOLS = True
 except ImportError:
@@ -276,87 +282,12 @@ RUN_PARALLEL_BLOCKLIST = [
 WINDOWS_COVERAGE_BLOCKLIST = []
 
 
-# These tests are slow enough that it's worth calculating whether the patch
-# touched any related files first. This list was manually generated, but for every
-# run with --determine-from, we use another generated list based on this one and the
-# previous test stats.
-TARGET_DET_LIST = [
-    "distributions/test_distributions",
-    "test_nn",
-    "test_autograd",
-    "test_cpp_extensions_jit",
-    "test_jit_legacy",
-    "test_dataloader",
-    "test_overrides",
-    "test_linalg",
-    "test_jit",
-    "test_jit_profiling",
-    "test_torch",
-    "test_binary_ufuncs",
-    "test_numpy_interop",
-    "test_reductions",
-    "test_shape_ops",
-    "test_sort_and_select",
-    "test_testing",
-    "test_view_ops",
-    "distributed/nn/jit/test_instantiator",
-    "distributed/rpc/test_tensorpipe_agent",
-    "distributed/rpc/cuda/test_tensorpipe_agent",
-    "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
-    "distributed/test_distributed_spawn",
-    "test_cuda",
-    "test_cuda_primary_ctx",
-    "test_cpp_extensions_aot_ninja",
-    "test_cpp_extensions_aot_no_ninja",
-    "test_serialization",
-    "test_optim",
-    "test_utils",
-    "test_multiprocessing",
-    "test_tensorboard",
-    "distributed/test_c10d_common",
-    "distributed/test_c10d_gloo",
-    "distributed/test_c10d_nccl",
-    "distributed/test_jit_c10d",
-    "distributed/test_c10d_spawn_gloo",
-    "distributed/test_c10d_spawn_nccl",
-    "distributed/test_store",
-    "distributed/test_pg_wrapper",
-    "test_quantization",
-    "test_pruning_op",
-    "test_determination",
-    "test_futures",
-    "distributed/pipeline/sync/skip/test_api",
-    "distributed/pipeline/sync/skip/test_gpipe",
-    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
-    "distributed/pipeline/sync/skip/test_leak",
-    "distributed/pipeline/sync/skip/test_portal",
-    "distributed/pipeline/sync/skip/test_stash_pop",
-    "distributed/pipeline/sync/skip/test_tracker",
-    "distributed/pipeline/sync/skip/test_verify_skippables",
-    "distributed/pipeline/sync/test_balance",
-    "distributed/pipeline/sync/test_bugs",
-    "distributed/pipeline/sync/test_checkpoint",
-    "distributed/pipeline/sync/test_copy",
-    "distributed/pipeline/sync/test_deferred_batch_norm",
-    "distributed/pipeline/sync/test_dependency",
-    "distributed/pipeline/sync/test_inplace",
-    "distributed/pipeline/sync/test_microbatch",
-    "distributed/pipeline/sync/test_phony",
-    "distributed/pipeline/sync/test_pipe",
-    "distributed/pipeline/sync/test_pipeline",
-    "distributed/pipeline/sync/test_stream",
-    "distributed/pipeline/sync/test_transparency",
-    "distributed/pipeline/sync/test_worker",
-]
-
 # the JSON file to store the S3 test stats
 TEST_TIMES_FILE = ".pytorch-test-times.json"
 
 # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST
 SLOW_TEST_THRESHOLD = 300
 
-_DEP_MODULES_CACHE: Dict[str, set] = {}
-
 DISTRIBUTED_TESTS_CONFIG = {}
 
 
@@ -957,136 +888,6 @@ def get_selected_tests(options):
     return selected_tests
 
 
-def test_impact_of_file(filename):
-    """Determine what class of impact this file has on test runs.
-
-    Possible values:
-        TORCH - torch python code
-        CAFFE2 - caffe2 python code
-        TEST - torch test code
-        UNKNOWN - may affect all tests
-        NONE - known to have no effect on test outcome
-        CI - CI configuration files
-    """
-    parts = filename.split(os.sep)
-    if parts[0] in [".jenkins", ".circleci"]:
-        return "CI"
-    if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
-        return "NONE"
-    elif parts[0] == "torch":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "TORCH"
-    elif parts[0] == "caffe2":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "CAFFE2"
-    elif parts[0] == "test":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "TEST"
-
-    return "UNKNOWN"
-
-
-def log_test_reason(file_type, filename, test, options):
-    if options.verbose:
-        print_to_stderr(
-            "Determination found {} file {} -- running {}".format(
-                file_type,
-                filename,
-                test,
-            )
-        )
-
-
-def get_dep_modules(test):
-    # Cache results in case of repetition
-    if test in _DEP_MODULES_CACHE:
-        return _DEP_MODULES_CACHE[test]
-
-    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    test_location = os.path.join(repo_root, "test", test + ".py")
-    finder = modulefinder.ModuleFinder(
-        # Ideally exclude all third party modules, to speed up calculation.
-        excludes=[
-            "scipy",
-            "numpy",
-            "numba",
-            "multiprocessing",
-            "sklearn",
-            "setuptools",
-            "hypothesis",
-            "llvmlite",
-            "joblib",
-            "email",
-            "importlib",
-            "unittest",
-            "urllib",
-            "json",
-            "collections",
-            # Modules below are excluded because they are hitting https://bugs.python.org/issue40350
-            # Trigger AttributeError: 'NoneType' object has no attribute 'is_package'
-            "mpl_toolkits",
-            "google",
-            "onnx",
-            # Triggers RecursionError
-            "mypy",
-        ],
-    )
-    # HACK: some platforms default to ascii, so we can't just run_script :(
-    with open(test_location, "r", encoding="utf-8") as fp:
-        finder.load_module("__main__", fp, test_location, ("", "r", 1))
-
-    dep_modules = set(finder.modules.keys())
-    _DEP_MODULES_CACHE[test] = dep_modules
-    return dep_modules
-
-
-def determine_target(target_det_list, test, touched_files, options):
-    test = parse_test_module(test)
-    # Some tests are faster to execute than to determine.
-    if test not in target_det_list:
-        if options.verbose:
-            print_to_stderr(f"Running {test} without determination")
-        return True
-    # HACK: "no_ninja" is not a real module
-    if test.endswith("_no_ninja"):
-        test = test[: (-1 * len("_no_ninja"))]
-    if test.endswith("_ninja"):
-        test = test[: (-1 * len("_ninja"))]
-
-    dep_modules = get_dep_modules(test)
-
-    for touched_file in touched_files:
-        file_type = test_impact_of_file(touched_file)
-        if file_type == "NONE":
-            continue
-        elif file_type == "CI":
-            # Force all tests to run if any change is made to the CI
-            # configurations.
-            log_test_reason(file_type, touched_file, test, options)
-            return True
-        elif file_type == "UNKNOWN":
-            # Assume uncategorized source files can affect every test.
-            log_test_reason(file_type, touched_file, test, options)
-            return True
-        elif file_type in ["TORCH", "CAFFE2", "TEST"]:
-            parts = os.path.splitext(touched_file)[0].split(os.sep)
-            touched_module = ".".join(parts)
-            # test/ path does not have a "test." namespace
-            if touched_module.startswith("test."):
-                touched_module = touched_module.split("test.")[1]
-            if touched_module in dep_modules or touched_module == test.replace(
-                "/", "."
-            ):
-                log_test_reason(file_type, touched_file, test, options)
-                return True
-
-    # If nothing has determined the test has run, don't run the test.
-    if options.verbose:
-        print_to_stderr(f"Determination is skipping {test}")
-
-    return False
-
-
 def run_test_module(test: str, test_directory: str, options) -> Optional[str]:
     test_module = parse_test_module(test)
 
@@ -1131,7 +932,7 @@ def main():
             specified_test_cases_filename, TESTS
         )
 
-    test_directory = os.path.dirname(os.path.abspath(__file__))
+    test_directory = str(REPO_ROOT / "test")
     selected_tests = get_selected_tests(options)
 
     if options.verbose:
@@ -1144,10 +945,10 @@ def main():
         slow_tests = get_slow_tests_based_on_S3(
             TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD
         )
-        print(
+        print_to_stderr(
             "Added the following tests to target_det tests as calculated based on S3:"
         )
-        print(slow_tests)
+        print_to_stderr(slow_tests)
         with open(options.determine_from, "r") as fh:
             touched_files = [
                 os.path.normpath(name.strip())
@@ -1155,22 +956,22 @@ def main():
                 if len(name.strip()) > 0
             ]
         # HACK: Ensure the 'test' paths can be traversed by Modulefinder
-        sys.path.append("test")
+        sys.path.append(test_directory)
         selected_tests = [
             test
             for test in selected_tests
-            if determine_target(
+            if should_run_test(
                 TARGET_DET_LIST + slow_tests, test, touched_files, options
             )
         ]
-        sys.path.remove("test")
+        sys.path.remove(test_directory)
 
     if IS_IN_CI:
         selected_tests = get_reordered_tests(
             selected_tests, ENABLE_PR_HISTORY_REORDERING
         )
         # downloading test cases configuration to local environment
-        get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__)))
+        get_test_case_configs(dirpath=test_directory)
 
     has_failed = False
     failure_messages = []
@@ -1191,8 +992,7 @@ def main():
         if options.coverage:
             from coverage import Coverage
 
-            test_dir = os.path.dirname(os.path.abspath(__file__))
-            with set_cwd(test_dir):
+            with set_cwd(test_directory):
                 cov = Coverage()
                 if PYTORCH_COLLECT_COVERAGE:
                     cov.load()
index 6b7fcc0..277bbd2 100644 (file)
@@ -30,7 +30,7 @@ class DeterminationTest(unittest.TestCase):
         return [
             test
             for test in cls.TESTS
-            if run_test.determine_target(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions())
+            if run_test.should_run_test(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions())
         ]
 
     def test_config_change_only(self):
diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py
new file mode 100644 (file)
index 0000000..8acd0ed
--- /dev/null
@@ -0,0 +1,224 @@
+import os
+import modulefinder
+import sys
+import pathlib
+import warnings
+from typing import Dict, Any, List, Set
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
+
+# These tests are slow enough that it's worth calculating whether the patch
+# touched any related files first. This list was manually generated, but for every
+# run with --determine-from, we use another generated list based on this one and the
+# previous test stats.
+TARGET_DET_LIST = [
+    "distributions/test_distributions",
+    "test_nn",
+    "test_autograd",
+    "test_cpp_extensions_jit",
+    "test_jit_legacy",
+    "test_dataloader",
+    "test_overrides",
+    "test_linalg",
+    "test_jit",
+    "test_jit_profiling",
+    "test_torch",
+    "test_binary_ufuncs",
+    "test_numpy_interop",
+    "test_reductions",
+    "test_shape_ops",
+    "test_sort_and_select",
+    "test_testing",
+    "test_view_ops",
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
+    "distributed/test_distributed_spawn",
+    "test_cuda",
+    "test_cuda_primary_ctx",
+    "test_cpp_extensions_aot_ninja",
+    "test_cpp_extensions_aot_no_ninja",
+    "test_serialization",
+    "test_optim",
+    "test_utils",
+    "test_multiprocessing",
+    "test_tensorboard",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_jit_c10d",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_store",
+    "distributed/test_pg_wrapper",
+    "test_quantization",
+    "test_pruning_op",
+    "test_determination",
+    "test_futures",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
+]
+
+_DEP_MODULES_CACHE: Dict[str, Set[str]] = {}
+
+
+def should_run_test(
+    target_det_list: List[str], test: str, touched_files: List[str], options: Any
+) -> bool:
+    test = parse_test_module(test)
+    # Some tests are faster to execute than to determine.
+    if test not in target_det_list:
+        if options.verbose:
+            print_to_stderr(f"Running {test} without determination")
+        return True
+    # HACK: "no_ninja" is not a real module
+    if test.endswith("_no_ninja"):
+        test = test[: (-1 * len("_no_ninja"))]
+    if test.endswith("_ninja"):
+        test = test[: (-1 * len("_ninja"))]
+
+    dep_modules = get_dep_modules(test)
+
+    for touched_file in touched_files:
+        file_type = test_impact_of_file(touched_file)
+        if file_type == "NONE":
+            continue
+        elif file_type == "CI":
+            # Force all tests to run if any change is made to the CI
+            # configurations.
+            log_test_reason(file_type, touched_file, test, options)
+            return True
+        elif file_type == "UNKNOWN":
+            # Assume uncategorized source files can affect every test.
+            log_test_reason(file_type, touched_file, test, options)
+            return True
+        elif file_type in ["TORCH", "CAFFE2", "TEST"]:
+            parts = os.path.splitext(touched_file)[0].split(os.sep)
+            touched_module = ".".join(parts)
+            # test/ path does not have a "test." namespace
+            if touched_module.startswith("test."):
+                touched_module = touched_module.split("test.")[1]
+            if touched_module in dep_modules or touched_module == test.replace(
+                "/", "."
+            ):
+                log_test_reason(file_type, touched_file, test, options)
+                return True
+
+    # If nothing has determined the test has run, don't run the test.
+    if options.verbose:
+        print_to_stderr(f"Determination is skipping {test}")
+
+    return False
+
+
+def test_impact_of_file(filename: str) -> str:
+    """Determine what class of impact this file has on test runs.
+
+    Possible values:
+        TORCH - torch python code
+        CAFFE2 - caffe2 python code
+        TEST - torch test code
+        UNKNOWN - may affect all tests
+        NONE - known to have no effect on test outcome
+        CI - CI configuration files
+    """
+    parts = filename.split(os.sep)
+    if parts[0] in [".jenkins", ".circleci"]:
+        return "CI"
+    if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
+        return "NONE"
+    elif parts[0] == "torch":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TORCH"
+    elif parts[0] == "caffe2":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "CAFFE2"
+    elif parts[0] == "test":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TEST"
+
+    return "UNKNOWN"
+
+
+def log_test_reason(file_type: str, filename: str, test: str, options: Any) -> None:
+    if options.verbose:
+        print_to_stderr(
+            "Determination found {} file {} -- running {}".format(
+                file_type,
+                filename,
+                test,
+            )
+        )
+
+
+def get_dep_modules(test: str) -> Set[str]:
+    # Cache results in case of repetition
+    if test in _DEP_MODULES_CACHE:
+        return _DEP_MODULES_CACHE[test]
+
+    test_location = REPO_ROOT / "test" / f"{test}.py"
+
+    # HACK: some platforms default to ascii, so we can't just run_script :(
+    finder = modulefinder.ModuleFinder(
+        # Ideally exclude all third party modules, to speed up calculation.
+        excludes=[
+            "scipy",
+            "numpy",
+            "numba",
+            "multiprocessing",
+            "sklearn",
+            "setuptools",
+            "hypothesis",
+            "llvmlite",
+            "joblib",
+            "email",
+            "importlib",
+            "unittest",
+            "urllib",
+            "json",
+            "collections",
+            # Modules below are excluded because they are hitting https://bugs.python.org/issue40350
+            # Trigger AttributeError: 'NoneType' object has no attribute 'is_package'
+            "mpl_toolkits",
+            "google",
+            "onnx",
+            # Triggers RecursionError
+            "mypy",
+        ],
+    )
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        finder.run_script(str(test_location))
+    dep_modules = set(finder.modules.keys())
+    _DEP_MODULES_CACHE[test] = dep_modules
+    return dep_modules
+
+
+def parse_test_module(test: str) -> str:
+    return test.split(".")[0]
+
+
+def print_to_stderr(message: str) -> None:
+    print(message, file=sys.stderr)