From fb7e40b7eb395c812327018415050bb25b1c5b6b Mon Sep 17 00:00:00 2001
From: andersj <anderspapitto@gmail.com>
Date: Wed, 28 Nov 2018 14:40:50 -0800
Subject: [PATCH] nccl fixes (#14195)

Summary:
This has 4 changes

1) propagate USE_SYSTEM_NCCL. Previously it was ignored and CMake always called find_package(NCCL)
2) respect SCCACHE_DISABLE in our caffe2 sccache wrapper for circleci
3) use SCCACHE_DISABLE when building nccl, because it triggers the same bug as when using CCACHE (already tracked in https://github.com/pytorch/pytorch/issues/13362). This was hidden because we weren't respecting USE_SYSTEM_NCCL, and were never building nccl ourselves in CI
4) In one particular CI configuration (caffe2, cuda 8, cudnn 7), force USE_SYSTEM_NCCL=1. Building the bundled nccl triggers a bug in nvlink. I've done some investigation, but this looks like a tricky, preexisting bug, so rather than hold up this diff I'm tracking it separately in https://github.com/pytorch/pytorch/issues/14486
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14195

Differential Revision: D13237502

Pulled By: anderspapitto

fbshipit-source-id: 1100ac1269c7cd39e2e0b3ba12a56a3ce8977c55
---
 .jenkins/caffe2/build.sh    | 31 ++++++++++++++++++++++---------
 cmake/External/nccl.cmake   | 21 ++++++++++++++++-----
 setup.py                    |  3 +++
 tools/build_pytorch_libs.sh |  3 +++
 4 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh
index d52b033..d9d1fd7 100755
--- a/.jenkins/caffe2/build.sh
+++ b/.jenkins/caffe2/build.sh
@@ -25,22 +25,29 @@ if [ "$(which gcc)" != "/root/sccache/gcc" ]; then
     fi
 
     # Setup wrapper scripts
-    for compiler in cc c++ gcc g++ x86_64-linux-gnu-gcc; do
+    wrapped="cc c++ gcc g++ x86_64-linux-gnu-gcc"
+    if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
+        wrapped="$wrapped nvcc"
+    fi
+    for compiler in $wrapped; do
       (
         echo "#!/bin/sh"
+
+        # TODO: if/when sccache gains native support for an
+        # SCCACHE_DISABLE flag analogous to ccache's CCACHE_DISABLE,
+        # this can be removed. Alternatively, this can be removed when
+        # https://github.com/pytorch/pytorch/issues/13362 is fixed.
+        #
+        # NOTE: carefully quoted - we want `which $compiler` to be
+        # resolved now, while this build script generates the wrapper,
+        # but SCCACHE_DISABLE and $@ to be evaluated when the wrapper runs
+        echo 'test $SCCACHE_DISABLE && exec '"$(which $compiler)"' "$@"'
+
         echo "exec $SCCACHE $(which $compiler) \"\$@\""
       ) > "./sccache/$compiler"
       chmod +x "./sccache/$compiler"
     done
 
-    if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
-      (
-        echo "#!/bin/sh"
-        echo "exec $SCCACHE $(which nvcc) \"\$@\""
-      ) > "./sccache/nvcc"
-      chmod +x "./sccache/nvcc"
-    fi
-
     export CACHE_WRAPPER_DIR="$PWD/sccache"
 
     # CMake must find these wrapper scripts
@@ -145,6 +152,12 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
   ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
 fi
 
+# building bundled nccl in this config triggers a bug in nvlink. For
+# more, see https://github.com/pytorch/pytorch/issues/14486
+if [[ "${BUILD_ENVIRONMENT}" == *-cuda8*-cudnn7* ]]; then
+    CMAKE_ARGS+=("-DUSE_SYSTEM_NCCL=ON")
+fi
+
 # Try to include Redis support for Linux builds
 if [ "$(uname)" == "Linux" ]; then
   CMAKE_ARGS+=("-DUSE_REDIS=ON")
diff --git a/cmake/External/nccl.cmake b/cmake/External/nccl.cmake
index 4c6d1b8..4274d75 100644
--- a/cmake/External/nccl.cmake
+++ b/cmake/External/nccl.cmake
@@ -1,12 +1,20 @@
 if (NOT __NCCL_INCLUDED)
   set(__NCCL_INCLUDED TRUE)
 
-  # try the system-wide nccl first
-  find_package(NCCL)
-  if (NCCL_FOUND)
+  if (USE_SYSTEM_NCCL)
+    # if we have explicit paths passed from setup.py, use those
+    if (NCCL_INCLUDE_DIR)
       add_library(__caffe2_nccl INTERFACE)
-      target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES})
-      target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS})
+      target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_SYSTEM_LIB})
+      target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIR})
+    else()
+      find_package(NCCL)
+      if (NCCL_FOUND)
+        add_library(__caffe2_nccl INTERFACE)
+        target_link_libraries(__caffe2_nccl INTERFACE ${NCCL_LIBRARIES})
+        target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS})
+      endif()
+    endif()
   else()
     if (TORCH_CUDA_ARCH_LIST)
       torch_cuda_get_nvcc_gencode_flag(NVCC_GENCODE)
@@ -21,7 +29,10 @@ if (NOT __NCCL_INCLUDED)
       CONFIGURE_COMMAND ""
       BUILD_COMMAND
         env
+        # TODO: remove these flags when
+        # https://github.com/pytorch/pytorch/issues/13362 is fixed
         "CCACHE_DISABLE=1"
+        "SCCACHE_DISABLE=1"
         make
         "CXX=${CMAKE_CXX_COMPILER}"
         "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}"
diff --git a/setup.py b/setup.py
index 505a58e..a207498 100644
--- a/setup.py
+++ b/setup.py
@@ -345,6 +345,8 @@ def build_libs(libs):
             my_env['CMAKE_INSTALL'] = 'make install'
     if USE_SYSTEM_NCCL:
         my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR
+        my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR
+        my_env["NCCL_SYSTEM_LIB"] = NCCL_SYSTEM_LIB
     if USE_CUDA:
         my_env["CUDA_BIN_PATH"] = CUDA_HOME
         build_libs_cmd += ['--use-cuda']
@@ -388,6 +390,7 @@ def build_libs(libs):
     my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF"
     my_env["USE_FFMPEG"] = "ON" if USE_FFMPEG else "OFF"
     my_env["USE_DISTRIBUTED"] = "ON" if USE_DISTRIBUTED else "OFF"
+    my_env["USE_SYSTEM_NCCL"] = "ON" if USE_SYSTEM_NCCL else "OFF"
 
     try:
         os.mkdir('build')
diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh
index 69e11d4..7f38589 100755
--- a/tools/build_pytorch_libs.sh
+++ b/tools/build_pytorch_libs.sh
@@ -214,6 +214,9 @@ function build_caffe2() {
 		       -DUSE_FBGEMM=$USE_FBGEMM \
 		       -DUSE_NUMPY=$USE_NUMPY \
 		       -DNUMPY_INCLUDE_DIR=$NUMPY_INCLUDE_DIR \
+		       -DUSE_SYSTEM_NCCL=$USE_SYSTEM_NCCL \
+		       -DNCCL_INCLUDE_DIR=$NCCL_INCLUDE_DIR \
+		       -DNCCL_SYSTEM_LIB=$NCCL_SYSTEM_LIB \
 		       -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \
 		       -DUSE_ROCM=$USE_ROCM \
 		       -DUSE_NNPACK=$USE_NNPACK \
-- 
2.7.4