From 8964a2e6e6f035590260ac5d4ab21f6e3d8a5204 Mon Sep 17 00:00:00 2001
From: Jesse Hellemn
Date: Mon, 14 Jan 2019 15:10:49 -0800
Subject: [PATCH] Split Caffe2 CI into cmake-only and python builds (#15917)

Summary:
bypass-lint

- Change all Caffe2 builds to use setup.py instead of cmake
- Add a -cmake- Caffe2 build configuration that uses cmake and only builds cpp
- Move skipIfCI logic from onnx test scripts to the rest of CI logic
- Removal of old PYTHONPATH/LD_LIBRARY_PATH/etc. env management

Pull Request resolved: https://github.com/pytorch/pytorch/pull/15917

Reviewed By: orionr

Differential Revision: D13637583

Pulled By: pjh5

fbshipit-source-id: c5c5639db0251ba12b6e4b51b2ac3b26a8953153
---
 .circleci/config.yml | 84 +++++-----
 .jenkins/caffe2/build.sh | 181 ++++++++++------
 .jenkins/caffe2/common.sh | 20 +--
 .jenkins/caffe2/test.sh | 70 ++++++++-
 caffe2/python/operator_test/layer_norm_op_test.py | 1 +
 cmake/Dependencies.cmake | 17 +-
 scripts/onnx/test.sh | 12 +-
 setup.py | 17 ++
 test/onnx/test_models.py | 7 +-
 test/onnx/test_operators.py | 5 -
 test/onnx/test_pytorch_common.py | 4 -
 tools/build_pytorch_libs.sh | 17 +-
 tools/setup_helpers/build.py | 8 +
 tools/setup_helpers/configure.py | 10 +-
 14 files changed, 255 insertions(+), 198 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 65f6a94..fada881 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -155,24 +155,14 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults
 no_output_timeout: "1h" command: | set -e - # TODO: merge this into Caffe2 build.sh cat >/home/circleci/project/ci_build_script.sh < /dev/null; then
@@ -210,7 +191,11 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults
 # Push intermediate Docker image for next phase to use if [ -z "${BUILD_ONLY}" ]; then - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-cmake-${CIRCLE_SHA1} + else + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + fi docker commit "$id" ${COMMIT_DOCKER_IMAGE} docker push ${COMMIT_DOCKER_IMAGE} fi
@@ -231,42 +216,24 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults
 # =================== The following code will be executed inside Docker container =================== set -ex + export BUILD_ENVIRONMENT="$BUILD_ENVIRONMENT" + # libdc1394 (dependency of OpenCV) expects /dev/raw1394 to exist...
sudo ln /dev/null /dev/raw1394 - # Hotfix, use hypothesis 3.44.6 on Ubuntu 14.04 - # See comments on https://github.com/HypothesisWorks/hypothesis-python/commit/eadd62e467d6cee6216e71b391951ec25b4f5830 - if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then - sudo pip -q uninstall -y hypothesis - # "pip install hypothesis==3.44.6" from official server is unreliable on CircleCI, so we host a copy on S3 instead - sudo pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl - sudo pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl - sudo pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl - fi - # conda must be added to the path for Anaconda builds (this location must be # the same as that in install_anaconda.sh used to build the docker image) if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then export PATH=/opt/conda/bin:$PATH fi - # set the env var for onnx build and test - if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then - export INTEGRATED=1 - fi - # Upgrade SSL module to avoid old SSL warnings pip -q install --user --upgrade pyOpenSSL ndg-httpsclient pyasn1 pip -q install --user -b /tmp/pip_install_onnx "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" - pip -q install --user future # Build - if test -x ".jenkins/caffe2/test.sh"; then - ./.jenkins/caffe2/test.sh - else - ./.jenkins/test.sh - fi + ./.jenkins/caffe2/test.sh # Remove benign core dumps. # These are tests for signal handling (including SIGABRT). @@ -276,7 +243,11 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults EOL chmod +x /home/circleci/project/ci_test_script.sh - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-cmake-${CIRCLE_SHA1} + else + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + fi echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null if [ -n "${CUDA_VERSION}" ]; then @@ -1197,6 +1168,23 @@ jobs: resource_class: gpu.medium <<: *caffe2_linux_test_defaults + caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build: + environment: + JOB_BASE_NAME: caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:238" + CUDA_VERSION: "9" + BUILD_ENVIRONMENT: "cmake-cuda9.0-cudnn7-ubuntu16.04" + <<: *caffe2_linux_build_defaults + + caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test: + environment: + JOB_BASE_NAME: caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:238" + CUDA_VERSION: "9" + BUILD_ENVIRONMENT: "cmake-cuda9.0-cudnn7-ubuntu16.04" + resource_class: gpu.medium + <<: *caffe2_linux_test_defaults + caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build: environment: JOB_BASE_NAME: caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-build @@ -2971,10 +2959,10 @@ workflows: - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build # Caffe2 builds - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: + - caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build + - caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test: requires: - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + - caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: 
requires: @@ -2991,10 +2979,12 @@ workflows: - caffe2_py2_gcc4_8_ubuntu14_04_test: requires: - caffe2_py2_gcc4_8_ubuntu14_04_build + - caffe2_onnx_py2_gcc5_ubuntu16_04_build - caffe2_onnx_py2_gcc5_ubuntu16_04_test: requires: - caffe2_onnx_py2_gcc5_ubuntu16_04_build + - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - caffe2_py2_clang3_8_ubuntu16_04_build - caffe2_py2_clang3_9_ubuntu16_04_build diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 4beb22d..b5efd64 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -2,6 +2,8 @@ set -ex +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + # TODO: Migrate all centos jobs to use proper devtoolset if [[ "$BUILD_ENVIRONMENT" == "py2-cuda9.0-cudnn7-centos7" ]]; then # There is a bug in pango packge on Centos7 that causes undefined @@ -10,15 +12,20 @@ if [[ "$BUILD_ENVIRONMENT" == "py2-cuda9.0-cudnn7-centos7" ]]; then sudo yum install -y -q glib2-2.56.1 fi -pip install --user --no-cache-dir hypothesis==3.59.0 +# CMAKE_ARGS are only passed to 'cmake' and the -Dfoo=bar does not work with +# setup.py, so we build a list of foo=bars and then either convert it to +# -Dfoo=bars or export them before running setup.py +build_args=() +build_to_cmake () { + cmake_args=() + for build_arg in $*; do + cmake_args+=("-D$build_arg") + done + echo ${cmake_args[@]} +} + -# The INSTALL_PREFIX here must match up with test.sh -INSTALL_PREFIX="/usr/local/caffe2" -LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) -CMAKE_ARGS=() SCCACHE="$(which sccache)" - if [ "$(which gcc)" != "/root/sccache/gcc" ]; then # Setup SCCACHE ############################################################################### @@ -95,48 +102,39 @@ report_compile_cache_stats() { fi } -############################################################################### -# Explicitly set Python executable. -############################################################################### -# On Ubuntu 16.04 the default Python is still 2.7. -PYTHON="$(which python)" -if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) 
]]; then - PYTHON=$(which "python${BASH_REMATCH[1]}") - CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") -fi - ############################################################################### # Use special scripts for Android and setup builds ############################################################################### if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then export ANDROID_NDK=/opt/ndk - CMAKE_ARGS+=("-DBUILD_BINARY=ON") - CMAKE_ARGS+=("-DBUILD_TEST=ON") - CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") - CMAKE_ARGS+=("-DUSE_ZSTD=ON") - "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" + build_args+=("BUILD_BINARY=ON") + build_args+=("BUILD_TEST=ON") + build_args+=("USE_OBSERVERS=ON") + build_args+=("USE_ZSTD=ON") + "${ROOT_DIR}/scripts/build_android.sh" $(build_to_cmake ${build_args[@]}) "$@" exit 0 fi - ############################################################################### -# Set cmake args +# Set parameters ############################################################################### -CMAKE_ARGS+=("-DBUILD_BINARY=ON") -CMAKE_ARGS+=("-DBUILD_TEST=ON") -CMAKE_ARGS+=("-DINSTALL_TEST=ON") -CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") -CMAKE_ARGS+=("-DUSE_ZSTD=ON") -CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") - +if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then + build_args+=("BUILD_PYTHON=OFF") +else + build_args+=("BUILD_PYTHON=ON") + build_args+=("PYTHON_EXECUTABLE=${PYTHON}") +fi if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then - CMAKE_ARGS+=("-DBLAS=MKL") - CMAKE_ARGS+=("-DUSE_MKLDNN=ON") + build_args+=("BLAS=MKL") + build_args+=("USE_MKLDNN=ON") fi +build_args+=("BUILD_BINARY=ON") +build_args+=("BUILD_TEST=ON") +build_args+=("INSTALL_TEST=ON") +build_args+=("USE_ZSTD=ON") if [[ $BUILD_ENVIRONMENT == py2-cuda9.0-cudnn7-ubuntu16.04 ]]; then - # removing http:// duplicate in favor of nvidia-ml.list # which is https:// version of the same repo sudo rm -f /etc/apt/sources.list.d/nvidia-machine-learning.list @@ -147,16 +145,18 @@ if [[ $BUILD_ENVIRONMENT == py2-cuda9.0-cudnn7-ubuntu16.04 ]]; then sudo apt-get install libnvinfer5 libnvinfer-dev rm ./nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0_1-1_amd64.deb - CMAKE_ARGS+=("-DUSE_TENSORRT=ON") + build_args+=("USE_TENSORRT=ON") fi if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then - CMAKE_ARGS+=("-DUSE_CUDA=ON") - CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") - CMAKE_ARGS+=("-DUSE_NNPACK=OFF") + build_args+=("USE_CUDA=ON") + build_args+=("USE_NNPACK=OFF") + + # Target only our CI GPU machine's CUDA arch to speed up the build + build_args+=("TORCH_CUDA_ARCH_LIST=Maxwell") # Explicitly set path to NVCC such that the symlink to ccache or sccache is used - CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") + build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. # Setting PATH to resolve to the right nvcc alone isn't enough. 
@@ -167,10 +167,16 @@ if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then export PATH="/usr/local/cuda/bin:$PATH" fi if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + build_args+=("USE_ROCM=ON") # This is needed to enable ImageInput operator in resnet50_trainer - CMAKE_ARGS+=("-USE_OPENCV=ON") + build_args+=("USE_OPENCV=ON") # This is needed to read datasets from https://download.caffe2.ai/databases/resnet_trainer.zip - CMAKE_ARGS+=("-USE_LMDB=ON") + build_args+=("USE_LMDB=ON") + # When hcc runs out of memory, it silently exits without stopping + # the build process, leaving undefined symbols in the shared lib + # which will cause undefined symbol errors when later running + # tests. Setting MAX_JOBS to smaller number to make CI less flaky. + export MAX_JOBS=4 ########## HIPIFY Caffe2 operators ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_amd.py" @@ -179,37 +185,25 @@ fi # building bundled nccl in this config triggers a bug in nvlink. For # more, see https://github.com/pytorch/pytorch/issues/14486 if [[ "${BUILD_ENVIRONMENT}" == *-cuda8*-cudnn7* ]]; then - CMAKE_ARGS+=("-DUSE_SYSTEM_NCCL=ON") + build_args+=("USE_SYSTEM_NCCL=ON") fi # Try to include Redis support for Linux builds if [ "$(uname)" == "Linux" ]; then - CMAKE_ARGS+=("-DUSE_REDIS=ON") -fi - -# Currently, on Jenkins mac os, we will use custom protobuf. Mac OS -# contbuild at the moment is minimal dependency - it doesn't use glog -# or gflags either. -if [ "$(uname)" == "Darwin" ]; then - CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") + build_args+=("USE_REDIS=ON") fi # Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace -CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") - -# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) -# and use that if so. -if [[ -x "$(command -v cmake3)" ]]; then - CMAKE_BINARY=cmake3 -else - CMAKE_BINARY=cmake -fi +build_args+=("ONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") ############################################################################### # Configure and make ############################################################################### -if [[ -z "$INTEGRATED" ]]; then +if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then + # cmake-only non-setup.py build, to test cpp only bits. This installs into + # /usr/local/caffe2 and installs no Python tests + build_args+=("CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") # Run cmake from ./build_caffe2 directory so it doesn't conflict with # standard PyTorch build directory. Eventually these won't need to @@ -218,8 +212,16 @@ if [[ -z "$INTEGRATED" ]]; then mkdir build_caffe2 cd ./build_caffe2 + # We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) + # and use that if so. + if [[ -x "$(command -v cmake3)" ]]; then + CMAKE_BINARY=cmake3 + else + CMAKE_BINARY=cmake + fi + # Configure - ${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" + ${CMAKE_BINARY} "${ROOT_DIR}" $(build_to_cmake ${build_args[@]}) "$@" # Build if [ "$(uname)" == "Linux" ]; then @@ -235,6 +237,18 @@ if [[ -z "$INTEGRATED" ]]; then ls $INSTALL_PREFIX else + # Python build. 
Uses setup.py to install into site-packages + build_args+=("USE_LEVELDB=ON") + build_args+=("USE_LMDB=ON") + build_args+=("USE_OPENCV=ON") + build_args+=("BUILD_TEST=ON") + # These flags preserve the flags that were used before this refactor (blame + # me) + build_args+=("USE_GLOG=ON") + build_args+=("USE_GFLAGS=ON") + build_args+=("USE_FBGEMM=OFF") + build_args+=("USE_MKLDNN=OFF") + build_args+=("USE_DISTRIBUTED=ON") # sccache will be stuck if all cores are used for compiling # see https://github.com/pytorch/pytorch/pull/7361 @@ -242,9 +256,15 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_TEST=1 BUILD_BINARY=1 python setup.py install --user + for build_arg in "${build_args[@]}"; do + export $build_arg + done + $PYTHON setup.py install --user - # This is to save test binaries for testing + # This is to save test binaries for testing. Copying caffe2/test to + # INSTALL_PREFIX, which is /usr/local/caffe2/, enables these setup.py builds + # to share cpp-tests test-code with the cmake-only build above. In test.sh + # the cpp tests are run in install_prefix cp -r torch/lib/tmp_install $INSTALL_PREFIX mkdir -p "$INSTALL_PREFIX/cpp_test/" cp -r caffe2/test/* "$INSTALL_PREFIX/cpp_test/" @@ -262,38 +282,3 @@ fi pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" report_compile_cache_stats - -# Symlink the caffe2 base python path into the system python path, -# so that we can import caffe2 without having to change $PYTHONPATH. -# Run in a subshell to contain environment set by /etc/os-release. -# -# This is only done when running on Jenkins! We don't want to pollute -# the user environment with Python symlinks and ld.so.conf.d hacks. -# -if [[ -z "$INTEGRATED" ]]; then - if [ -n "${JENKINS_URL}" ]; then - ( - source /etc/os-release - - function python_version() { - "$PYTHON" -c 'import sys; print("python%d.%d" % sys.version_info[0:2])' - } - - # Debian/Ubuntu - if [[ "$ID_LIKE" == *debian* ]]; then - python_path="/usr/local/lib/$(python_version)/dist-packages" - sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" - fi - - # RHEL/CentOS - if [[ "$ID_LIKE" == *rhel* ]]; then - python_path="/usr/lib64/$(python_version)/site-packages/" - sudo ln -sf "${INSTALL_PREFIX}/caffe2" "${python_path}" - fi - - # /etc/ld.so.conf.d is used on both Debian and RHEL - echo "${INSTALL_PREFIX}/lib" | sudo tee /etc/ld.so.conf.d/caffe2.conf - sudo ldconfig - ) - fi -fi diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh index 32f8674..7704c11 100644 --- a/.jenkins/caffe2/common.sh +++ b/.jenkins/caffe2/common.sh @@ -2,21 +2,17 @@ set -ex LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) +TEST_DIR="$ROOT_DIR/caffe2_tests" +gtest_reports_dir="${TEST_DIR}/cpp" +pytest_reports_dir="${TEST_DIR}/python" # Figure out which Python to use -PYTHON="python" +PYTHON="$(which python)" if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then - PYTHON="python${BASH_REMATCH[1]}" + PYTHON=$(which "python${BASH_REMATCH[1]}") fi -# Find where Caffe2 is installed. This will be the absolute path to the -# site-packages of the active Python installation +# /usr/local/caffe2 is where the cpp bits are installed to in in cmake-only +# builds. 
In +python builds the cpp tests are copied to /usr/local/caffe2 so +# that the test code in .jenkins/test.sh is the same INSTALL_PREFIX="/usr/local/caffe2" -SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib(prefix=''))") -INSTALL_SITE_DIR="${INSTALL_PREFIX}/${SITE_DIR}" -CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2" - -# Set PYTHONPATH and LD_LIBRARY_PATH so that python can find the installed -# Caffe2. -export PYTHONPATH="${PYTHONPATH}:$INSTALL_SITE_DIR" -export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib" diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 623639b..6800393 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -8,9 +8,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then exit 0 fi -cd "$ROOT_DIR" - -TEST_DIR="$ROOT_DIR/caffe2_tests" rm -rf "$TEST_DIR" && mkdir -p "$TEST_DIR" cd "${WORKSPACE}" @@ -20,7 +17,7 @@ cd "${WORKSPACE}" ############# echo "Running C++ tests.." -gtest_reports_dir="${TEST_DIR}/cpp" +export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${INSTALL_PREFIX}/lib" mkdir -p "$gtest_reports_dir" for test in $(find "${INSTALL_PREFIX}/cpp_test" -executable -type f); do case "$test" in @@ -51,11 +48,60 @@ for test in $(find "${INSTALL_PREFIX}/cpp_test" -executable -type f); do esac done -################ +################################################################################ # Python tests # -################ +################################################################################ +if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then + exit 0 +fi + +# Ideally this would be where the Python bits get installed to from using +# setup.py. However on our dockers this is not correct for two reasons. +# 1. This lies in /usr/local/lib/pythonM.m, but the dockers don't have the +# right permissions setup so the build doesn't have write access to this +# dir. For this reason we use --user flag in all pip install instructions +# which install into the $HOME/.local directory instead. +# 2. This returns lib/pythonM.m/dist-packages, but we install in site-packages. +# We use this same way of getting the install directory in other places in our +# build, so not really sure why it is not correct here +INSTALL_SITE_DIR=$($PYTHON -c "from distutils import sysconfig; print(sysconfig.get_python_lib())") +if [[ -n "$(find $INSTALL_SITE_DIR -name caffe2 2>/dev/null)" ]]; then + # Caffe2 will probably be found here if using a Python from a virtualenv or + # from conda + CAFFE2_PYPATH="$INSTALL_SITE_DIR/caffe2" +elif [[ -n "$(find $HOME/.local/lib -name caffe2 2>/dev/null)" ]]; then + # Caffe2 will be found here in the case of using no env and adding --user to + # the setup.py call + pyver=($(python --version 2>&1)) + pyver=${pyver[1]} + pyver=${pyver:0:3} + CAFFE2_PYPATH="$HOME/.local/lib/python$pyver/site-packages/caffe2" +else + echo "I do not know where Caffe2 is installed" + find / -name caffe2 2>/dev/null + exit 1 +fi +if [[ ! 
-d "$CAFFE2_PYPATH" ]]; then + echo "Failed to find where Caffe2 Python bits are installed" + find / -name caffe2 2>/dev/null + exit 1 +fi + + +if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then + # Hotfix, use hypothesis 3.44.6 on Ubuntu 14.04 + # See comments on + # https://github.com/HypothesisWorks/hypothesis-python/commit/eadd62e467d6cee6216e71b391951ec25b4f5830 + sudo pip -q uninstall -y hypothesis + # "pip install hypothesis==3.44.6" from official server is unreliable on + # CircleCI, so we host a copy on S3 instead + sudo pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl + sudo pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl + sudo pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl +else + pip install --user --no-cache-dir hypothesis==3.59.0 +fi -pytest_reports_dir="${TEST_DIR}/python" mkdir -p "$pytest_reports_dir" # Collect additional tests to run (outside caffe2/python) @@ -75,6 +121,10 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py") rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py") + + # On ROCm, RCCL (distributed) development isn't complete. + # https://github.com/ROCmSoftwarePlatform/rccl + rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/data_parallel_model_test.py") fi # NB: Warnings are disabled because they make it harder to see what @@ -95,9 +145,13 @@ pip install --user pytest-sugar "$CAFFE2_PYPATH/python" \ "${EXTRA_TESTS[@]}" +##################### +# torchvision tests # +##################### + cd ${INSTALL_PREFIX} -if [[ -n "$INTEGRATED" ]]; then +if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then pip install --user torchvision "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 052e2f7..6be3fe0 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -11,6 +11,7 @@ import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import os import unittest +import torch class TestLayerNormOp(serial.SerializedTestCase): diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index b302d19..84f73d4 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -751,17 +751,14 @@ if(USE_ROCM) else() caffe2_update_option(USE_ROCM OFF) endif() -endif() -# ---[ ROCm -if(USE_ROCM) - include_directories(SYSTEM ${HIP_PATH}/include) - include_directories(SYSTEM ${ROCBLAS_PATH}/include) - include_directories(SYSTEM ${ROCFFT_PATH}/include) - include_directories(SYSTEM ${HIPSPARSE_PATH}/include) - include_directories(SYSTEM ${HIPRAND_PATH}/include) - include_directories(SYSTEM ${ROCRAND_PATH}/include) - include_directories(SYSTEM ${THRUST_PATH}) + include_directories(SYSTEM ${HIP_PATH}/include) + include_directories(SYSTEM ${ROCBLAS_PATH}/include) + include_directories(SYSTEM ${ROCFFT_PATH}/include) + include_directories(SYSTEM ${HIPSPARSE_PATH}/include) + include_directories(SYSTEM ${HIPRAND_PATH}/include) + include_directories(SYSTEM ${ROCRAND_PATH}/include) + include_directories(SYSTEM ${THRUST_PATH}) endif() # ---[ NCCL diff --git 
a/scripts/onnx/test.sh b/scripts/onnx/test.sh index c928f97..22afe0a 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -35,8 +35,14 @@ test_paths=( "$top_dir/test/onnx" ) +args=() +args+=("-v") if [[ $PARALLEL == 1 ]]; then - pytest -n 3 "${test_paths[@]}" -else - pytest "${test_paths[@]}" + args+=("-n") + args+=("3") fi + +pytest "${args[@]}" \ + -k \ + 'not (TestOperators and test_full_like) and not (TestOperators and test_zeros_like) and not (TestOperators and test_ones_like) and not (TestModels and test_super_resolution) and not (TestModels and test_vgg16) and not (TestModels and test_vgg16_bn) and not (TestModels and test_vgg19) and not (TestModels and test_vgg19_bn)' \ + "${test_paths[@]}" diff --git a/setup.py b/setup.py index 8a4cac4..4fd8dca 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,20 @@ # specify a namespace for ONNX built here rather than the hard-coded # one in this file; needed to build with other frameworks that share ONNX. # +# BLAS +# BLAS to be used by Caffe2. Can be MKL, Eigen, ATLAS, or OpenBLAS. If set +# then the build will fail if the requested BLAS is not found, otherwise +# the BLAS will be chosen based on what is found on your system. +# +# USE_FBGEMM +# Enables use of FBGEMM +# +# USE_REDIS +# Whether to use Redis for distributed workflows (Linux only) +# +# USE_ZSTD +# Enables use of ZSTD, if the libraries are found +# # Environment variables we respect (these environment variables are # conventional and are often understood/set by other software.) # @@ -102,6 +116,9 @@ # specify a different compiler than the system one to use as the CUDA # host compiler for nvcc. # +# CUDA_NVCC_EXECUTABLE +# Specify a NVCC to use. This is used in our CI to point to a cached nvcc +# # CUDNN_LIB_DIR # CUDNN_INCLUDE_DIR # CUDNN_LIBRARY diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index 4625a50..f3cbc44 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -12,7 +12,7 @@ from model_defs.srresnet import SRResNet from model_defs.dcgan import _netD, _netG, weights_init, bsz, imgsz, nz from model_defs.op_test import DummyNet, ConcatNet, PermuteNet, PReluNet -from test_pytorch_common import TestCase, run_tests, skipIfNoLapack, skipIfCI +from test_pytorch_common import TestCase, run_tests, skipIfNoLapack import torch import torch.onnx @@ -77,7 +77,6 @@ class TestModels(TestCase): x = Variable(torch.randn(1, 3, 224, 224).fill_(1.0)) self.exportTest(toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x)) - @skipIfCI @skipIfNoLapack def test_super_resolution(self): x = Variable( @@ -96,25 +95,21 @@ class TestModels(TestCase): x = Variable(torch.randn(BATCH_SIZE, 1, 28, 28).fill_(1.0)) self.exportTest(toC(MNIST()), toC(x)) - @skipIfCI def test_vgg16(self): # VGG 16-layer model (configuration "D") x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(vgg16()), toC(x)) - @skipIfCI def test_vgg16_bn(self): # VGG 16-layer model (configuration "D") with batch normalization x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(vgg16_bn()), toC(x)) - @skipIfCI def test_vgg19(self): # VGG 19-layer model (configuration "E") x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(vgg19()), toC(x)) - @skipIfCI def test_vgg19_bn(self): # VGG 19-layer model (configuration 'E') with batch normalization x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 8484f9e..af53a3e 
100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -17,8 +17,6 @@ import shutil import sys import common_utils as common -from test_pytorch_common import skipIfCI - '''Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data] --no-onnx: no onnx python dependence @@ -300,7 +298,6 @@ class TestOperators(TestCase): x = torch.randn(3, 4, requires_grad=True) self.assertONNX(lambda x: torch.full(x.shape, 2), x) - @skipIfCI def test_full_like(self): x = torch.randn(3, 4, requires_grad=True) self.assertONNX(lambda x: torch.full_like(x, 2), x) @@ -488,12 +485,10 @@ class TestOperators(TestCase): x = torch.randn(3, 4) self.assertONNX(torch.nn.Linear(4, 5, bias=True), x) - @skipIfCI def test_zeros_like(self): x = torch.randn(5, 8, requires_grad=True) self.assertONNX(lambda x: torch.zeros_like(x), x) - @skipIfCI def test_ones_like(self): x = torch.randn(6, 10, requires_grad=True) self.assertONNX(lambda x: torch.ones_like(x), x) diff --git a/test/onnx/test_pytorch_common.py b/test/onnx/test_pytorch_common.py index 304f9c4..ce61b5c 100644 --- a/test/onnx/test_pytorch_common.py +++ b/test/onnx/test_pytorch_common.py @@ -35,10 +35,6 @@ skipIfNoCuda = _skipper(lambda: not torch.cuda.is_available(), skipIfTravis = _skipper(lambda: os.getenv('TRAVIS'), 'Skip In Travis') -skipIfCI = _skipper(lambda: os.getenv('CI') or os.getenv('TRAVIS') or - os.getenv('JENKINS_URL') or os.getenv('INTEGRATED'), - 'Skip In CI') - def flatten(x): return tuple(function._iter_filter(lambda o: isinstance(o, torch.Tensor))(x)) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 82ca8aa..4a550ff 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -204,6 +204,21 @@ function build_caffe2() { if [[ -n $CMAKE_PREFIX_PATH ]]; then EXTRA_CAFFE2_CMAKE_FLAGS+=("-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH") fi + if [[ -n $BLAS ]]; then + EXTRA_CAFFE2_CMAKE_FLAGS+=("-DBLAS=$BLAS") + fi + if [[ -n $CUDA_NVCC_EXECUTABLE ]]; then + EXTRA_CAFFE2_CMAKE_FLAGS+=("-DCUDA_NVCC_EXECUTABLE=$CUDA_NVCC_EXECUTABLE") + fi + if [[ -n $USE_REDIS ]]; then + EXTRA_CAFFE2_CMAKE_FLAGS+=("-DUSE_REDIS=$USE_REDIS") + fi + if [[ -n $USE_GLOG ]]; then + EXTRA_CAFFE2_CMAKE_FLAGS+=("-DUSE_GLOG=$USE_GLOG") + fi + if [[ -n $USE_GFLAGS ]]; then + EXTRA_CAFFE2_CMAKE_FLAGS+=("-DUSE_GFLAGS=$USE_GFLAGS") + fi if [[ $RERUN_CMAKE -eq 1 ]] || [ ! 
-f CMakeCache.txt ]; then ${CMAKE_COMMAND} $BASE_DIR \ @@ -240,8 +255,6 @@ function build_caffe2() { -DUSE_QNNPACK=$USE_QNNPACK \ -DUSE_TENSORRT=$USE_TENSORRT \ -DUSE_FFMPEG=$USE_FFMPEG \ - -DUSE_GLOG=OFF \ - -DUSE_GFLAGS=OFF \ -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \ -DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \ diff --git a/tools/setup_helpers/build.py b/tools/setup_helpers/build.py index 252da64..e7f1d9b 100644 --- a/tools/setup_helpers/build.py +++ b/tools/setup_helpers/build.py @@ -1,11 +1,19 @@ +import os from .env import check_env_flag, check_negative_env_flag +BLAS = os.getenv('BLAS') BUILD_BINARY = check_env_flag('BUILD_BINARY') BUILD_TEST = not check_negative_env_flag('BUILD_TEST') BUILD_CAFFE2_OPS = not check_negative_env_flag('BUILD_CAFFE2_OPS') USE_LEVELDB = check_env_flag('USE_LEVELDB') USE_LMDB = check_env_flag('USE_LMDB') USE_OPENCV = check_env_flag('USE_OPENCV') +USE_REDIS = check_env_flag('USE_REDIS') USE_TENSORRT = check_env_flag('USE_TENSORRT') USE_FFMPEG = check_env_flag('USE_FFMPEG') USE_FBGEMM = not (check_env_flag('NO_FBGEMM') or check_negative_env_flag('USE_FBGEMM')) +USE_ZSTD = check_env_flag('USE_ZSTD') + +# These aren't in ./cuda.py because they need to be passed directly to cmake, +# since cmake files expect them +CUDA_NVCC_EXECUTABLE = os.getenv('CUDA_NVCC_EXECUTABLE') diff --git a/tools/setup_helpers/configure.py b/tools/setup_helpers/configure.py index 480000f..d7affe5 100644 --- a/tools/setup_helpers/configure.py +++ b/tools/setup_helpers/configure.py @@ -8,9 +8,9 @@ from .env import (IS_ARM, IS_DARWIN, IS_LINUX, IS_PPC, IS_WINDOWS, hotpatch_build_env_vars() -from .build import (BUILD_BINARY, BUILD_CAFFE2_OPS, BUILD_TEST, USE_FBGEMM, - USE_FFMPEG, USE_LEVELDB, USE_LMDB, USE_OPENCV, - USE_TENSORRT) +from .build import (BLAS, BUILD_BINARY, BUILD_CAFFE2_OPS, BUILD_TEST, + USE_FBGEMM, USE_FFMPEG, USE_LEVELDB, USE_LMDB, USE_OPENCV, + USE_REDIS, USE_TENSORRT, USE_ZSTD, CUDA_NVCC_EXECUTABLE) from .cuda import CUDA_HOME, CUDA_VERSION, USE_CUDA from .cudnn import CUDNN_INCLUDE_DIR, CUDNN_LIB_DIR, CUDNN_LIBRARY, USE_CUDNN from .dist_check import USE_DISTRIBUTED, USE_GLOO_IBVERBS @@ -66,6 +66,8 @@ def get_common_env_with_flags(): my_env = os.environ.copy() my_env["PYTORCH_PYTHON"] = sys.executable my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE + if BLAS: + my_env["BLAS"] = BLAS if USE_SYSTEM_NCCL: my_env["NCCL_ROOT_DIR"] = NCCL_ROOT_DIR my_env["NCCL_INCLUDE_DIR"] = NCCL_INCLUDE_DIR @@ -75,6 +77,8 @@ def get_common_env_with_flags(): extra_flags += ['--use-cuda'] if IS_WINDOWS: my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME + if CUDA_NVCC_EXECUTABLE: + my_env["CUDA_NVCC_EXECUTABLE"] = CUDA_NVCC_EXECUTABLE if USE_CUDA_STATIC_LINK: extra_flags += ['--cuda-static-link'] if USE_FBGEMM: -- 2.7.4
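As a rough, self-contained illustration of the build-argument scheme this patch introduces in .jenkins/caffe2/build.sh: flags are collected as plain FOO=bar pairs, converted to -DFOO=bar only for the cmake-only flavor, and exported before setup.py for the python flavor. The sketch below is condensed, assumes BUILD_ENVIRONMENT and ROOT_DIR come from the surrounding CI environment, and uses echo in place of the real cmake and setup.py invocations.

#!/bin/bash
# Condensed sketch of the build-args flow; not the full CI script.
set -ex

build_args=()
build_args+=("BUILD_BINARY=ON")
build_args+=("BUILD_TEST=ON")
build_args+=("USE_ZSTD=ON")

# Turn FOO=bar entries into -DFOO=bar for a plain cmake invocation.
build_to_cmake () {
  local cmake_args=()
  local build_arg
  for build_arg in "$@"; do
    cmake_args+=("-D$build_arg")
  done
  echo "${cmake_args[@]}"
}

if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then
  # cmake-only build: pass the flags on the cmake command line.
  echo cmake "$ROOT_DIR" $(build_to_cmake "${build_args[@]}")
else
  # setup.py build: export the flags; tools/setup_helpers reads them back.
  for build_arg in "${build_args[@]}"; do
    export "$build_arg"
  done
  echo python setup.py install --user
fi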
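The matching .circleci/config.yml change keys the intermediate Docker image tag off the build flavor, so the cmake-only test job pulls the image pushed by the cmake-only build job rather than the setup.py one. A minimal sketch of that selection, assuming DOCKER_IMAGE and CIRCLE_SHA1 are provided by CircleCI and with echo standing in for docker commit/push:

#!/bin/bash
# Tag cmake-only images with a -cmake- infix so build and test jobs stay paired.
if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then
  export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-cmake-${CIRCLE_SHA1}
else
  export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1}
fi
echo "COMMIT_DOCKER_IMAGE: ${COMMIT_DOCKER_IMAGE}"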
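The environment variables newly documented in setup.py (BLAS, USE_REDIS, USE_ZSTD, USE_FBGEMM, CUDA_NVCC_EXECUTABLE) are forwarded through tools/setup_helpers/build.py and tools/build_pytorch_libs.sh to cmake, so a local setup.py build can opt into the same configuration the CI jobs use. A hypothetical invocation; the nvcc wrapper path is only an example and should point at whatever ccache/sccache shim exists on your machine:

#!/bin/bash
# Example values only; adjust for your own toolchain.
export BLAS=MKL
export USE_REDIS=ON
export USE_ZSTD=ON
export CUDA_NVCC_EXECUTABLE=/opt/cache/bin/nvcc  # hypothetical sccache/ccache wrapper
python setup.py install --user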
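Finally, the skipIfCI decorators removed from test/onnx are replaced by an explicit pytest deselection in scripts/onnx/test.sh, keeping CI-only skip logic out of the test sources. An abridged sketch of that pattern; the real script lists every deselected test in the -k expression, and -n requires pytest-xdist:

#!/bin/bash
# Deselect known-problematic-in-CI tests instead of decorating them in-source.
set -ex

args=("-v")
if [[ "${PARALLEL:-0}" == 1 ]]; then
  args+=("-n" "3")
fi

pytest "${args[@]}" \
  -k 'not (TestOperators and test_full_like) and not (TestModels and test_vgg16)' \
  test/onnx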