From fb7e40b7eb395c812327018415050bb25b1c5b6b Mon Sep 17 00:00:00 2001 From: andersj Date: Wed, 28 Nov 2018 14:40:50 -0800 Subject: [PATCH] nccl fixes (#14195) Summary: This has 4 changes 1) propagate USE_SYSTEM_NCCL. Previously it was ignored and cmake always did a FindPackage 2) respect SCCACHE_DISABLE in our caffe2 sccache wrapper for circleci 3) use SCCACHE_DISABLE when building nccl, because it triggers the same bug as when using CCACHE (already tracked in https://github.com/pytorch/pytorch/issues/13362). This was hidden because we weren't respecting USE_SYSTEM_NCCL, and were never building nccl ourselves in CI 4) In one particular CI configuration (caffe2, cuda 8, cudnn 7), force USE_SYSTEM_NCCL=1. Building the bundled nccl triggers a bug in nvlink. I've done some investigation, but this looks like a tricky, preexisting bug, so rather than hold up this diff I'm tracking it separately in https://github.com/pytorch/pytorch/issues/14486 Pull Request resolved: https://github.com/pytorch/pytorch/pull/14195 Differential Revision: D13237502 Pulled By: anderspapitto fbshipit-source-id: 1100ac1269c7cd39e2e0b3ba12a56a3ce8977c55 --- .jenkins/caffe2/build.sh | 31 ++++++++++++++++++++++--------- cmake/External/nccl.cmake | 21 ++++++++++++++++----- setup.py | 3 +++ tools/build_pytorch_libs.sh | 3 +++ 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index d52b033..d9d1fd7 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -25,22 +25,29 @@ if [ "$(which gcc)" != "/root/sccache/gcc" ]; then fi # Setup wrapper scripts - for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do + wrapped="cc c++ gcc g++ x86_64-linux-gnu-gcc" + if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then + wrapped="$wrapped nvcc" + fi + for compiler in $wrapped; do ( echo "#!/bin/sh" + + # TODO: if/when sccache gains native support for an + # SCCACHE_DISABLE flag analogous to ccache's CCACHE_DISABLE, + # this can be 
removed. Alternatively, this can be removed when + # https://github.com/pytorch/pytorch/issues/13362 is fixed. + # + # NOTE: carefully quoted - we want `which compiler` to be + # resolved as we generate the script, but SCCACHE_DISABLE and + # $@ to be evaluated when we execute the script + echo 'test $SCCACHE_DISABLE && exec '"$(which $compiler)"' "$@"' + echo "exec $SCCACHE $(which $compiler) \"\$@\"" ) > "./sccache/$compiler" chmod +x "./sccache/$compiler" done - if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then - ( - echo "#!/bin/sh" - echo "exec $SCCACHE $(which nvcc) \"\$@\"" - ) > "./sccache/nvcc" - chmod +x "./sccache/nvcc" - fi - export CACHE_WRAPPER_DIR="$PWD/sccache" # CMake must find these wrapper scripts @@ -145,6 +152,12 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" fi +# building bundled nccl in this config triggers a bug in nvlink. For +# more, see https://github.com/pytorch/pytorch/issues/14486 +if [[ "${BUILD_ENVIRONMENT}" == *-cuda8*-cudnn7* ]]; then + CMAKE_ARGS+=("-DUSE_SYSTEM_NCCL=ON") +fi + # Try to include Redis support for Linux builds if [ "$(uname)" == "Linux" ]; then CMAKE_ARGS+=("-DUSE_REDIS=ON") diff --git a/cmake/External/nccl.cmake b/cmake/External/nccl.cmake index 4c6d1b8..4274d75 100644 --- a/cmake/External/nccl.cmake +++ b/cmake/External/nccl.cmake @@ -1,12 +1,20 @@ if (NOT __NCCL_INCLUDED) set(__NCCL_INCLUDED TRUE) - # try the system-wide nccl first - find_package(NCCL) - if (NCCL_FOUND) + if (USE_SYSTEM_NCCL) + # if we have explicit paths passed from setup.py, use those + if (NCCL_INCLUDE_DIR) add_library(__caffe2_nccl INTERFACE) - target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES}) - target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS}) + target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_SYSTEM_LIB}) + target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIR}) + else() + find_package(NCCL) + if (NCCL_FOUND) + 
add_library(__caffe2_nccl INTERFACE) + target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES}) + target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS}) + endif() + endif() else() if (TORCH_CUDA_ARCH_LIST) torch_cuda_get_nvcc_gencode_flag(NVCC_GENCODE) @@ -21,7 +29,10 @@ if (NOT __NCCL_INCLUDED) CONFIGURE_COMMAND "" BUILD_COMMAND env + # TODO: remove these flags when + # https://github.com/pytorch/pytorch/issues/13362 is fixed "CCACHE_DISABLE=1" + "SCCACHE_DISABLE=1" make "CXX=${CMAKE_CXX_COMPILER}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" diff --git a/setup.py b/setup.py index 505a58e..a207498 100644 --- a/setup.py +++ b/setup.py @@ -345,6 +345,8 @@ def build_libs(libs): my_env['CMAKE_INSTALL'] = 'make install' if USE_SYSTEM_NCCL: my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR + my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR + my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB if USE_CUDA: my_env["CUDA_BIN_PATH"] = CUDA_HOME build_libs_cmd += ['--use-cuda'] @@ -388,6 +390,7 @@ def build_libs(libs): my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF" my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF" my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF" + my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF" try: os.mkdir('build') diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 69e11d4..7f38589 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -214,6 +214,9 @@ function build_caffe2() { -DUSE_FBGEMM=$USE_FBGEMM \ -DUSE_NUMPY=$USE_NUMPY \ -DNUMPY_INCLUDE_DIR=$NUMPY_INCLUDE_DIR \ + -DUSE_SYSTEM_NCCL=$USE_SYSTEM_NCCL \ + -DNCCL_INCLUDE_DIR=$NCCL_INCLUDE_DIR \ + -DNCCL_SYSTEM_LIB=$NCCL_SYSTEM_LIB \ -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ -DUSE_ROCM=$USE_ROCM \ -DUSE_NNPACK=$USE_NNPACK \ -- 2.7.4