[CI] move distributed test into its own CI job (#62896)
authorRong Rong (AI Infra) <rongr@fb.com>
Thu, 26 Aug 2021 15:00:48 +0000 (08:00 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Thu, 26 Aug 2021 15:02:20 +0000 (08:02 -0700)
Summary:
Moving the distributed tests into their own CI job.

- [x] ensure there is a distributed test job for every default test job matrix (on GHA); a sketch of the gating is included after this list
- [x] ensure that the CircleCI jobs work for distributed as well
- [x] waiting for the distributed tests to have their own run_test.py launch options, see https://github.com/pytorch/pytorch/issues/63147
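
For reference, a minimal sketch of how the new gating composes on the GHA side, mirroring the generate_pytorch_test_matrix.py change below (the 'default' config and the fallback runner type here are illustrative, not the script's exact output):

    import json
    import os

    # Illustrative default; the real script reads the runner type from the workflow's env block.
    TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE', 'linux.2xlarge')
    configs = {'default': {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}}

    # New in this change: when a generated workflow exports ENABLE_DISTRIBUTED_TEST=1,
    # a single-shard 'distributed' config is added on the regular test runner.
    if os.getenv('ENABLE_DISTRIBUTED_TEST'):
        configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}

    print(json.dumps(configs, indent=2))

The resulting "distributed" test config reaches .jenkins/pytorch/test.sh as a TEST_CONFIG suffix appended to BUILD_ENVIRONMENT, which routes that job to test_distributed and test_rpc instead of the default test list.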

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62896

Reviewed By: seemethere

Differential Revision: D30230856

Pulled By: walterddr

fbshipit-source-id: 0cad620f6cd9e56c727c105458d76539a5ae976f

12 files changed:
.circleci/cimodel/data/pytorch_build_definitions.py
.circleci/config.yml
.github/scripts/generate_ci_workflows.py
.github/scripts/generate_pytorch_test_matrix.py
.github/templates/linux_ci_workflow.yml.j2
.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
.jenkins/pytorch/test.sh

index bdc9772..d7b2015 100644 (file)
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@@ -415,6 +415,27 @@ def instantiate_configs(only_slow_gradcheck):
             )
             c.dependent_tests.append(bc_breaking_check)
 
+        if (
+            compiler_name != "clang"
+            and not rocm_version
+            and not is_libtorch
+            and not is_vulkan
+            and not is_pure_torch
+            and not is_noarch
+            and not is_slow_gradcheck
+            and not only_slow_gradcheck
+        ):
+            distributed_test = Conf(
+                c.gen_build_name("") + "distributed",
+                [],
+                is_xla=False,
+                restrict_phases=["test"],
+                is_libtorch=False,
+                is_important=True,
+                parent_build=c,
+            )
+            c.dependent_tests.append(distributed_test)
+
         config_list.append(c)
 
     return config_list
index cb3e148..1bb32b5 100644 (file)
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7159,6 +7159,13 @@ workflows:
           build_environment: "pytorch-linux-backward-compatibility-check-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7184,6 +7191,13 @@ workflows:
           build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7209,6 +7223,13 @@ workflows:
           build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7246,6 +7267,13 @@ workflows:
           build_environment: "pytorch-linux-xenial-py3.6-gcc7-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_xenial_py3_clang7_asan_build
           requires:
@@ -7380,6 +7408,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
           requires:
@@ -7402,6 +7437,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_py3_6_clang9_noarch_build
           requires:
@@ -7463,6 +7505,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed_test
+          requires:
+            - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_rocm3_9_py3_6_build
           requires:
index f1b9625..cd7065d 100755 (executable)
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -138,10 +138,12 @@ class CIWorkflow:
     only_build_on_pull_request: bool = False
     only_run_smoke_tests_on_pull_request: bool = False
     num_test_shards_on_pull_request: int = -1
+    distributed_test: bool = True
 
     # The following variables will be set as environment variables,
     # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string.
     enable_jit_legacy_test: YamlShellBool = "''"
+    enable_distributed_test: YamlShellBool = "''"
     enable_multigpu_test: YamlShellBool = "''"
     enable_nogpu_no_avx_test: YamlShellBool = "''"
     enable_nogpu_no_avx2_test: YamlShellBool = "''"
@@ -154,6 +156,9 @@ class CIWorkflow:
         if not self.on_pull_request:
             self.only_build_on_pull_request = False
 
+        if self.distributed_test:
+            self.enable_distributed_test = 1
+
         # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are
         # only running smoke tests on the pull request.
         if self.num_test_shards_on_pull_request == -1:
index d8860a0..75df57c 100755 (executable)
--- a/.github/scripts/generate_pytorch_test_matrix.py
+++ b/.github/scripts/generate_pytorch_test_matrix.py
@@ -51,6 +51,8 @@ def main() -> None:
         configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
     if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'):
         configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
+    if os.getenv('ENABLE_DISTRIBUTED_TEST'):
+        configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     if os.getenv('ENABLE_SLOW_TEST'):
         configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     matrix = {
index f636852..d9af899 100644 (file)
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -248,6 +248,7 @@ jobs:
     {%- endif %}
     env:
       TEST_RUNNER_TYPE: !{{ test_runner_type }}
+      ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }}
       ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }}
       ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }}
       ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
index 0b3dddd..769efca 100644 (file)
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
index 624e9d0..ddd81c0 100644 (file)
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
index 99a9f1f..5a888d0 100644 (file)
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: 1
       ENABLE_MULTIGPU_TEST: 1
       ENABLE_NOGPU_NO_AVX_TEST: 1
index be56b56..25d74de 100644 (file)
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
index c1b877c..341f9e6 100644 (file)
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
index 375c4b6..470fdaa 100644 (file)
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -222,6 +222,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
index e27ba3e..daa0da7 100755 (executable)
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -19,6 +19,11 @@ BUILD_DIR="build"
 BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin
 
+# GHA defines a test config for the test job, so append it to BUILD_ENVIRONMENT here.
+if [[ -n "${TEST_CONFIG}" ]]; then
+    BUILD_ENVIRONMENT="${BUILD_ENVIRONMENT}-${TEST_CONFIG}"
+fi
+
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
@@ -522,6 +527,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *vulkan-linux* ]]; then
   test_vulkan
 elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   test_bazel
+elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
+  test_distributed
+  test_rpc
 else
   install_torchvision
   install_monkeytype
@@ -532,9 +540,7 @@ else
   test_custom_script_ops
   test_custom_backend
   test_torch_function_benchmark
-  test_distributed
   test_benchmarks
-  test_rpc
   if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc7-test* || "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc5.4-test* ]]; then
     test_python_gloo_with_tls
   fi