From cb959aa708a886edb3e4e998881978c1e4816aa6 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 3 Apr 2019 13:38:56 -0700 Subject: [PATCH] Switch our Linux machine AMI to a newer image. (#18433) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18433 ghimport-source-id: 1c92f98b091232c0045a2e1db75d19c1f258ac1f Differential Revision: D14748827 Pulled By: ezyang fbshipit-source-id: a459451058cf5560811403bafb96c6ff083d7e3a --- .circleci/config.yml | 98 ++++++++++++---------- .circleci/verbatim-sources/header-section.yml | 72 ++++++++++------ .circleci/verbatim-sources/job-specs-custom.yml | 4 +- .../verbatim-sources/job-specs-html-update.yml | 2 +- .../linux-binary-build-defaults.yml | 4 +- .../verbatim-sources/linux-build-defaults.yml | 12 +-- .../nightly-build-smoke-tests-defaults.yml | 4 +- 7 files changed, 110 insertions(+), 86 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index db6d274..46db2a7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -27,26 +27,37 @@ setup_linux_system_environment: &setup_linux_system_environment # Set up CircleCI GPG keys for apt, if needed curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - -# NOTE: We only perform the merge in build step and not in test step, because -# all source files will be shared from build to test -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -ex - - sudo killall apt-get || true - sudo rm /var/lib/apt/lists/lock || true - sudo rm /var/cache/apt/archives/lock || true - sudo rm /var/lib/dpkg/lock || true + # Stop background apt updates. Hypothetically, the kill should not + # be necessary, because stop is supposed to send a kill signal to + # the process, but we've added it for good luck. Also + # hypothetically, it's supposed to be unnecessary to wait for + # the process to block. We also have that line for good luck. + # If you like, try deleting them and seeing if it works. + sudo systemctl stop apt-daily.service || true + sudo systemctl kill --kill-who=all apt-daily.service || true + + sudo systemctl stop unattended-upgrades.service || true + sudo systemctl kill --kill-who=all unattended-upgrades.service || true + + # wait until `apt-get update` has been killed + while systemctl is-active --quiet apt-daily.service + do + sleep 1; + done + while systemctl is-active --quiet unattended-upgrades.service + do + sleep 1; + done + + # See if we actually were successful + systemctl list-units --all | cat + + sudo apt-get purge -y unattended-upgrades cat /etc/apt/sources.list - sudo sed -i 's#archive.ubuntu.com/ubuntu#us-east-1.ec2.archive.ubuntu.com/ubuntu#g' /etc/apt/sources.list - sudo sed -i 's#security.ubuntu.com/ubuntu#us-east-1.ec2.archive.ubuntu.com/ubuntu#g' /etc/apt/sources.list - cat /etc/apt/sources.list - sudo apt-get -q -y update - sudo apt-get -q -y install openssh-client git + ps ax | grep apt + ps ax | grep dpkg install_doc_push_script: &install_doc_push_script name: Install the doc push script @@ -189,24 +200,30 @@ setup_ci_environment: &setup_ci_environment # Set up NVIDIA docker repo curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get -q -y update - sudo apt-get -q -y remove linux-image-generic linux-headers-generic linux-generic docker-ce + sudo apt-get -y update + sudo apt-get -y remove linux-image-generic linux-headers-generic linux-generic docker-ce # WARNING: Docker version is hardcoded here; you must update the # version number below for docker-ce and nvidia-docker2 to get newer # versions of Docker. We hardcode these numbers because we kept # getting broken CI when Docker would update their docker version, # and nvidia-docker2 would be out of date for a day until they # released a newer version of their package. - sudo apt-get -q -y install \ + # + # How to figure out what the correct versions of these packages are? + # My preferred method is to start a Docker instance of the correct + # Ubuntu version (e.g., docker run -it ubuntu:16.04) and then ask + # apt what the packages you need are. Note that the CircleCI image + # comes with Docker. + sudo apt-get -y install \ linux-headers-$(uname -r) \ linux-image-generic \ moreutils \ - docker-ce=18.06.2~ce~3-0~ubuntu \ - nvidia-docker2=2.0.3+docker18.06.2-1 \ + docker-ce=5:18.09.4~3-0~ubuntu-xenial \ + nvidia-docker2=2.0.3+docker18.09.4-1 \ expect-dev sudo pkill -SIGHUP dockerd @@ -214,8 +231,9 @@ setup_ci_environment: &setup_ci_environment sudo pip -q install awscli==1.16.35 if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then - wget 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-410.79.run' - sudo /bin/bash ./NVIDIA-Linux-x86_64-410.79.run -s --no-drm + DRIVER_FN="NVIDIA-Linux-x86_64-410.104.run" + wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" + sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) nvidia-smi fi @@ -273,12 +291,10 @@ macos_brew_update: &macos_brew_update pytorch_linux_build_defaults: &pytorch_linux_build_defaults resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - - run: - <<: *install_official_git_client - checkout - run: <<: *setup_ci_environment @@ -308,7 +324,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults pytorch_linux_test_defaults: &pytorch_linux_test_defaults machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -337,12 +353,10 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults caffe2_linux_build_defaults: &caffe2_linux_build_defaults resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - - run: - <<: *install_official_git_client - checkout - run: <<: *setup_ci_environment @@ -398,7 +412,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -817,7 +831,7 @@ binary_linux_build: &binary_linux_build # that on the docker executor) binary_linux_test: &binary_linux_test machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -883,7 +897,7 @@ binary_linux_test: &binary_linux_test binary_linux_upload: &binary_linux_upload machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -1068,13 +1082,11 @@ binary_mac_upload: &binary_mac_upload ############################################################################## smoke_linux_test: &smoke_linux_test machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - run: - <<: *install_official_git_client - - run: <<: *setup_ci_environment - run: <<: *binary_populate_env @@ -1360,7 +1372,7 @@ jobs: USE_CUDA_DOCKER_RUNTIME: "1" resource_class: gpu.medium machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -1391,7 +1403,7 @@ jobs: DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:291" resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -1705,7 +1717,7 @@ jobs: # update_s3_htmls job update_s3_htmls: machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index 5e691ef..30ed060 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -27,26 +27,37 @@ setup_linux_system_environment: &setup_linux_system_environment # Set up CircleCI GPG keys for apt, if needed curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - -# NOTE: We only perform the merge in build step and not in test step, because -# all source files will be shared from build to test -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -ex + # Stop background apt updates. Hypothetically, the kill should not + # be necessary, because stop is supposed to send a kill signal to + # the process, but we've added it for good luck. Also + # hypothetically, it's supposed to be unnecessary to wait for + # the process to block. We also have that line for good luck. + # If you like, try deleting them and seeing if it works. + sudo systemctl stop apt-daily.service || true + sudo systemctl kill --kill-who=all apt-daily.service || true + + sudo systemctl stop unattended-upgrades.service || true + sudo systemctl kill --kill-who=all unattended-upgrades.service || true + + # wait until `apt-get update` has been killed + while systemctl is-active --quiet apt-daily.service + do + sleep 1; + done + while systemctl is-active --quiet unattended-upgrades.service + do + sleep 1; + done + + # See if we actually were successful + systemctl list-units --all | cat + + sudo apt-get purge -y unattended-upgrades - sudo killall apt-get || true - sudo rm /var/lib/apt/lists/lock || true - sudo rm /var/cache/apt/archives/lock || true - sudo rm /var/lib/dpkg/lock || true - - cat /etc/apt/sources.list - sudo sed -i 's#archive.ubuntu.com/ubuntu#us-east-1.ec2.archive.ubuntu.com/ubuntu#g' /etc/apt/sources.list - sudo sed -i 's#security.ubuntu.com/ubuntu#us-east-1.ec2.archive.ubuntu.com/ubuntu#g' /etc/apt/sources.list cat /etc/apt/sources.list - sudo apt-get -q -y update - sudo apt-get -q -y install openssh-client git + ps ax | grep apt + ps ax | grep dpkg install_doc_push_script: &install_doc_push_script name: Install the doc push script @@ -189,24 +200,30 @@ setup_ci_environment: &setup_ci_environment # Set up NVIDIA docker repo curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get -q -y update - sudo apt-get -q -y remove linux-image-generic linux-headers-generic linux-generic docker-ce + sudo apt-get -y update + sudo apt-get -y remove linux-image-generic linux-headers-generic linux-generic docker-ce # WARNING: Docker version is hardcoded here; you must update the # version number below for docker-ce and nvidia-docker2 to get newer # versions of Docker. We hardcode these numbers because we kept # getting broken CI when Docker would update their docker version, # and nvidia-docker2 would be out of date for a day until they # released a newer version of their package. - sudo apt-get -q -y install \ + # + # How to figure out what the correct versions of these packages are? + # My preferred method is to start a Docker instance of the correct + # Ubuntu version (e.g., docker run -it ubuntu:16.04) and then ask + # apt what the packages you need are. Note that the CircleCI image + # comes with Docker. + sudo apt-get -y install \ linux-headers-$(uname -r) \ linux-image-generic \ moreutils \ - docker-ce=18.06.2~ce~3-0~ubuntu \ - nvidia-docker2=2.0.3+docker18.06.2-1 \ + docker-ce=5:18.09.4~3-0~ubuntu-xenial \ + nvidia-docker2=2.0.3+docker18.09.4-1 \ expect-dev sudo pkill -SIGHUP dockerd @@ -214,8 +231,9 @@ setup_ci_environment: &setup_ci_environment sudo pip -q install awscli==1.16.35 if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then - wget 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-410.79.run' - sudo /bin/bash ./NVIDIA-Linux-x86_64-410.79.run -s --no-drm + DRIVER_FN="NVIDIA-Linux-x86_64-410.104.run" + wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" + sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) nvidia-smi fi diff --git a/.circleci/verbatim-sources/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs-custom.yml index 3d70858..6892c45 100644 --- a/.circleci/verbatim-sources/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs-custom.yml @@ -6,7 +6,7 @@ USE_CUDA_DOCKER_RUNTIME: "1" resource_class: gpu.medium machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -37,7 +37,7 @@ DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:291" resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment diff --git a/.circleci/verbatim-sources/job-specs-html-update.yml b/.circleci/verbatim-sources/job-specs-html-update.yml index 2a28898..39e760a 100644 --- a/.circleci/verbatim-sources/job-specs-html-update.yml +++ b/.circleci/verbatim-sources/job-specs-html-update.yml @@ -1,7 +1,7 @@ # update_s3_htmls job update_s3_htmls: machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment diff --git a/.circleci/verbatim-sources/linux-binary-build-defaults.yml b/.circleci/verbatim-sources/linux-binary-build-defaults.yml index 7f5f002..581a9c1 100644 --- a/.circleci/verbatim-sources/linux-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-binary-build-defaults.yml @@ -60,7 +60,7 @@ binary_linux_build: &binary_linux_build # that on the docker executor) binary_linux_test: &binary_linux_test machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -126,7 +126,7 @@ binary_linux_test: &binary_linux_test binary_linux_upload: &binary_linux_upload machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment diff --git a/.circleci/verbatim-sources/linux-build-defaults.yml b/.circleci/verbatim-sources/linux-build-defaults.yml index 953cb55..0156171 100644 --- a/.circleci/verbatim-sources/linux-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-build-defaults.yml @@ -7,12 +7,10 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - - run: - <<: *install_official_git_client - checkout - run: <<: *setup_ci_environment @@ -42,7 +40,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults pytorch_linux_test_defaults: &pytorch_linux_test_defaults machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment @@ -71,12 +69,10 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults caffe2_linux_build_defaults: &caffe2_linux_build_defaults resource_class: large machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - - run: - <<: *install_official_git_client - checkout - run: <<: *setup_ci_environment @@ -132,7 +128,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment diff --git a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml index f2235f7..f6fa4d9 100644 --- a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml +++ b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml @@ -5,13 +5,11 @@ ############################################################################## smoke_linux_test: &smoke_linux_test machine: - image: default + image: ubuntu-1604:201903-01 steps: - run: <<: *setup_linux_system_environment - run: - <<: *install_official_git_client - - run: <<: *setup_ci_environment - run: <<: *binary_populate_env -- 2.7.4