From f922b58b5fe420538f35d4c88953b664be85d7ee Mon Sep 17 00:00:00 2001
From: Sergei Vorobev <sergei.vorobev@getcruise.com>
Date: Fri, 27 Aug 2021 09:31:36 -0700
Subject: [PATCH] [bazel] GPU-support: add @local_config_cuda and @cuda
 (#63604)

Summary:
## Context

We take the first step towards GPU support in the bazel build by adding the bazel external workspaces `local_config_cuda` and `cuda`. The first one contains some hardcoded values and lists of files; the second one provides a nicer, high-level wrapper that maps onto the bazel targets pytorch already expects, which are guarded with the `if_cuda` macro.

The prefix `local_config_` signifies the fact that we are breaking the bazel hermeticity philosophy by explicitly relying on the CUDA installation that is present on the machine.

## Testing

Notice an important scenario that is unlocked by this change: compilation of cpp code that depends on cuda libraries (e.g. cuda.h and so on).

Before:
```
sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10
ERROR: /home/sergei.vorobev/src/pytorch4/tools/config/BUILD:12:1: no such package 'tools/toolchain': BUILD file not found in any of the following directories. Add a BUILD file to a directory to mark it as a package.
 - /home/sergei.vorobev/src/pytorch4/tools/toolchain and referenced by '//tools/config:cuda_enabled_and_capable'
ERROR: While resolving configuration keys for //:c10: Analysis failed
ERROR: Analysis of target '//:c10' failed; build aborted: Analysis failed
INFO: Elapsed time: 0.259s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (2 packages loaded, 2 targets configured)
```

After:
```
sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10
INFO: Analyzed target //:c10 (6 packages loaded, 246 targets configured).
INFO: Found 1 target...
Target //:c10 up-to-date:
  bazel-bin/libc10.lo
  bazel-bin/libc10.so
INFO: Elapsed time: 0.617s, Critical Path: 0.04s
INFO: 0 processes.
INFO: Build completed successfully, 1 total action
```

The `//:c10` target is a good one to test this with, because it has cases where the [glob is different](https://github.com/pytorch/pytorch/blob/075024b9a34904ec3ecdab3704c3bcaa329bdfea/BUILD.bazel#L76-L81), depending on whether we compile for CUDA or not.

## What is out of scope of this PR

This PR is the first in a series providing comprehensive GPU bazel build support. In particular, we don't tackle the [cu_library](https://github.com/pytorch/pytorch/blob/11a40ad915d4d3d8551588e303204810887fcf8d/tools/rules/cu.bzl#L2) implementation here. This would be a separate large chunk of work.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63604

Reviewed By: soulitzer

Differential Revision: D30442083

Pulled By: malfet

fbshipit-source-id: b2a8e4f7e5a25a69b960a82d9e36ba568eb64595
---
 .bazelrc                                           |   6 +-
 .github/scripts/generate_ci_workflows.py           |   2 +-
 ...enerated-linux-xenial-py3.6-gcc7-bazel-test.yml |   2 +-
 .jenkins/pytorch/build.sh                          |   4 +
 WORKSPACE                                          |  13 +-
 third_party/cuda.BUILD                             |  43 ++
 third_party/tensorflow_cuda_bazel_build/BUILD      |   0
 third_party/tensorflow_cuda_bazel_build/README.md  |   5 +
 third_party/tensorflow_cuda_bazel_build/WORKSPACE  |   1 +
 third_party/tensorflow_cuda_bazel_build/cuda/BUILD | 451 +++++++++++++++++++++
 tools/config/BUILD                                 |   1 -
 tools/rules/workspace.bzl                          |  25 ++
 12 files changed, 548 insertions(+), 5 deletions(-)
 create mode 100644 third_party/cuda.BUILD
 create mode 100644 third_party/tensorflow_cuda_bazel_build/BUILD
 create mode 100644 third_party/tensorflow_cuda_bazel_build/README.md
 create mode 100644 third_party/tensorflow_cuda_bazel_build/WORKSPACE
 create mode 100755 third_party/tensorflow_cuda_bazel_build/cuda/BUILD

diff --git a/.bazelrc b/.bazelrc
index ecfe8fd..310eb29 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -3,7 +3,11 @@ build --copt=-I.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 
 # Configuration to disable tty features for environments like CI
-
 build:no-tty --curses no
 build:no-tty --progress_report_interval 10
 build:no-tty --show_progress_rate_limit 10
+
+# Configuration to build with GPU support
+build:gpu --define=cuda=true
+# define a separate build folder for faster switching between configs
+build:gpu --platform_suffix=-gpu
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index cd7065d..f1819db 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -497,7 +497,7 @@ BAZEL_WORKFLOWS = [
     CIWorkflow(
         arch="linux",
         build_environment="linux-xenial-py3.6-gcc7-bazel-test",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
         test_runner_type=LINUX_CPU_TEST_RUNNER,
         on_pull_request=True,
         ciflow_config=CIFlowConfig(
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 71a9bf7..2331442 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -15,7 +15,7 @@ on:
 
 env:
   BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index d7b66e7..085cf51 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
 
   get_bazel
 
+  # first build the whole torch for CPU-only
   tools/bazel build --config=no-tty :torch
+  # then build selected set of targets with GPU-support.
+  # TODO: eventually this should converge to building the whole :torch with GPU-support
+  tools/bazel build --config=no-tty --config=gpu :c10
 else
   # check that setup.py would fail with bad arguments
   echo "The next three invocations are expected to fail with invalid command error messages."
diff --git a/WORKSPACE b/WORKSPACE
index 6f5028d..9396a34 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,7 +1,7 @@
 workspace(name = "pytorch")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("//tools/rules:workspace.bzl", "new_patched_local_repository")
+load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")
 
 http_archive(
     name = "bazel_skylib",
@@ -170,3 +170,14 @@ protobuf_deps()
 load("@rules_python//python:repositories.bzl", "py_repositories")
 
 py_repositories()
+
+local_repository(
+    name = "local_config_cuda",
+    path = "third_party/tensorflow_cuda_bazel_build",
+)
+
+# Wrapper to expose local_config_cuda in an agnostic way
+new_empty_repository(
+    name = "cuda",
+    build_file = "//third_party:cuda.BUILD",
+)
diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD
new file mode 100644
index 0000000..0c58b34
--- /dev/null
+++ b/third_party/cuda.BUILD
@@ -0,0 +1,43 @@
+"""
+Collect all the CUDA stuff from @local_config_cuda in a single target
+for convenience.
+"""
+
+cc_library(
+    name = "cuda",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cudart",
+        "@local_config_cuda//cuda:cufft",
+        "@local_config_cuda//cuda:curand",
+    ],
+)
+
+cc_library(
+    name = "cupti",
+    deps = [
+        "@local_config_cuda//cuda:cupti_headers",
+        "@local_config_cuda//cuda:cupti_link",
+    ],
+)
+
+[
+    alias(
+        name = lib,
+        actual = "@local_config_cuda//cuda:{}".format(lib),
+        visibility = ["//visibility:public"],
+    )
+    for lib in [
+        "cublas",
+        "cufft",
+        "cusolver",
+        "cusparse",
+        "curand",
+        "nvrtc",
+        "cuda_driver",
+        "nvToolsExt",
+    ]
+]
diff --git a/third_party/tensorflow_cuda_bazel_build/BUILD b/third_party/tensorflow_cuda_bazel_build/BUILD
new file mode 100644
index 0000000..e69de29
diff --git a/third_party/tensorflow_cuda_bazel_build/README.md b/third_party/tensorflow_cuda_bazel_build/README.md
new file mode 100644
index 0000000..439e195
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/README.md
@@ -0,0 +1,5 @@
+# Config for CUDA
+
+This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated by running `./configure` from https://github.com/tensorflow/tensorflow and then edited manually to fit pytorch's needs.
+
+The TensorFlow project is licensed under the Apache License 2.0. The full LICENSE file can be found at https://github.com/tensorflow/tensorflow/blob/master/LICENSE.
diff --git a/third_party/tensorflow_cuda_bazel_build/WORKSPACE b/third_party/tensorflow_cuda_bazel_build/WORKSPACE
new file mode 100644
index 0000000..59369ce
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/WORKSPACE
@@ -0,0 +1 @@
+workspace(name = "local_config_cuda")
diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD
new file mode 100755
index 0000000..f7271af
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD
@@ -0,0 +1,451 @@
+licenses([
+    "restricted",
+    "reciprocal",
+    "notice",
+])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        ":cuda-include",
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "include",
+    ],
+)
+
+cc_library(
+    name = "cudnn_headers",
+    hdrs = [
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "include",
+    ],
+)
+
+cc_library(
+    name = "cudart_static",
+    linkopts = [
+        "-L/usr/local/cuda/lib64",
+    ],
+)
+
+cc_library(
+    name = "cuda_driver",
+    linkopts = ["-lcuda"],
+    deps = [":linker_search_path"],
+)
+
+# Provides the RPATH for Nvidia-less systems to be able to run binaries linked to libcuda.
+cc_library(
+    name = "driver_stub_runtime",
+    linkopts = [
+        "-Wl,-rpath,/usr/local/cuda/lib64/stubs",
+    ],
+    deps = [":cuda_driver"],
+)
+
+cc_library(
+    name = "linker_search_path",
+    linkopts = [
+        "-L/usr/local/cuda/lib64",
+        "-L/usr/local/cuda/lib64/stubs",
+        "-Wl,-rpath-link,/usr/local/cuda/lib64",
+        "-Wl,-rpath-link,/usr/local/cuda/lib64/stubs",
+    ],
+)
+
+[
+    cc_library(
+        name = libname,
+        linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []),
+        linkstatic = True,
+        deps = [":linker_search_path"],
+    )
+    for libname in [
+        "cublas",
+        "cudart",
+        "cudnn",
+        "cufft",
+        "curand",
+        "cusolver",
+        "cusparse",
+        "nvrtc",
+        "nvToolsExt",
+    ]
+]
+
+cc_library(
+    name = "cuda",
+    deps = [
+        ":cublas",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+        ":nvToolsExt",
+    ],
+)
+
+# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html)
+# used by OpenCV
+cc_library(
+    name = "nppi",
+    linkopts = [
+        "-lnppc",
+        "-lnppial",
+        "-lnppicom",
+        "-lnppidei",
+        "-lnppif",
+        "-lnppig",
+        "-lnppim",
+        "-lnppist",
+        "-lnppitc",
+        "-lnpps",
+    ],
+    linkstatic = True,
+    deps = [":linker_search_path"],
+)
+
+# NVIDIA Management Library
+cc_library(
+    name = "nvml",
+    linkopts = [
+        "-lnvidia-ml",
+        "-Wl,-rpath,/usr/lib/nvidia-410",
+        "-Wl,-rpath,/usr/lib/nvidia-390",
+        "-Wl,-rpath,/usr/lib/nvidia-387",
+        "-Wl,-rpath,/usr/lib/nvidia-384",
+    ],
+    deps = [":linker_search_path"],
+)
+
+cc_library(
+    name = "cupti_headers",
+    hdrs = [
+        ":cuda-extras",
+    ],
+    includes = [
+        ".",
+        "extras/CUPTI/include/",
+    ],
+)
+
+# cupti .so exposed at linktime
+cc_library(
+    name = "cupti_link",
+    linkopts = [
+        "-L/usr/local/cuda/extras/CUPTI/lib64",
+        "-lcupti",
+    ],
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = [":cuda-nvvm"],
+)
+
+CUDA_INCLUDES_FILES = [
+    "include/builtin_types.h",
+    "include/channel_descriptor.h",
+    "include/CL/cl_egl.h",
+    "include/CL/cl_ext.h",
+    "include/CL/cl_gl_ext.h",
+    "include/CL/cl_gl.h",
+    "include/CL/cl.h",
+    "include/CL/cl.hpp",
+    "include/CL/cl_platform.h",
+    "include/CL/opencl.h",
+    "include/common_functions.h",
+    "include/cooperative_groups.h",
+    "include/cooperative_groups_helpers.h",
+    "include/crt/common_functions.h",
+    "include/crt/device_double_functions.h",
+    "include/crt/device_double_functions.hpp",
+    "include/crt/device_functions.h",
+    "include/crt/device_functions.hpp",
+    "include/crt/func_macro.h",
+    "include/crt/host_config.h",
+    "include/crt/host_defines.h",
+    "include/crt/host_runtime.h",
+    "include/crt/math_functions.h",
+    "include/crt/math_functions.hpp",
+    "include/crt/mma.h",
+    "include/crt/mma.hpp",
+    "include/crt/nvfunctional",
+    "include/crt/sm_70_rt.h",
+    "include/crt/sm_70_rt.hpp",
+    "include/crt/storage_class.h",
+    # TODO: figure out why on a CI machine with CUDA 10.2 it's not present
+    # "include/cublas_api.h",
+    # "include/cublas.h",
+    # "include/cublas_v2.h",
+    # "include/cublasXt.h",
+    "include/cuComplex.h",
+    "include/cuda_device_runtime_api.h",
+    "include/cudaEGL.h",
+    "include/cuda_egl_interop.h",
+    "include/cuda_fp16.h",
+    "include/cuda_fp16.hpp",
+    "include/cudaGL.h",
+    "include/cuda_gl_interop.h",
+    "include/cuda.h",
+    "include/cudalibxt.h",
+    "include/cuda_occupancy.h",
+    "include/cuda_profiler_api.h",
+    "include/cudaProfiler.h",
+    "include/cudart_platform.h",
+    "include/cuda_runtime_api.h",
+    "include/cuda_runtime.h",
+    "include/cuda_surface_types.h",
+    "include/cuda_texture_types.h",
+    "include/cudaVDPAU.h",
+    "include/cuda_vdpau_interop.h",
+    "include/cufft.h",
+    "include/cufftw.h",
+    "include/cufftXt.h",
+    "include/curand_discrete2.h",
+    "include/curand_discrete.h",
+    "include/curand_globals.h",
+    "include/curand.h",
+    "include/curand_kernel.h",
+    "include/curand_lognormal.h",
+    "include/curand_mrg32k3a.h",
+    "include/curand_mtgp32dc_p_11213.h",
+    "include/curand_mtgp32.h",
+    "include/curand_mtgp32_host.h",
+    "include/curand_mtgp32_kernel.h",
+    "include/curand_normal.h",
+    "include/curand_normal_static.h",
+    "include/curand_philox4x32_x.h",
+    "include/curand_poisson.h",
+    "include/curand_precalc.h",
+    "include/curand_uniform.h",
+    "include/cusolver_common.h",
+    "include/cusolverDn.h",
+    "include/cusolverRf.h",
+    "include/cusolverSp.h",
+    "include/cusolverSp_LOWLEVEL_PREVIEW.h",
+    "include/cusparse.h",
+    "include/cusparse_v2.h",
+    "include/device_atomic_functions.h",
+    "include/device_atomic_functions.hpp",
+    "include/device_double_functions.h",
+    "include/device_functions.h",
+    "include/device_launch_parameters.h",
+    "include/device_types.h",
+    "include/driver_functions.h",
+    "include/driver_types.h",
+    "include/fatBinaryCtl.h",
+    "include/fatbinary.h",
+    "include/host_config.h",
+    "include/host_defines.h",
+    "include/library_types.h",
+    "include/math_constants.h",
+    "include/math_functions.h",
+    "include/mma.h",
+    "include/nppcore.h",
+    "include/nppdefs.h",
+    "include/npp.h",
+    "include/nppi_arithmetic_and_logical_operations.h",
+    "include/nppi_color_conversion.h",
+    "include/nppi_compression_functions.h",
+    "include/nppi_computer_vision.h",
+    "include/nppi_data_exchange_and_initialization.h",
+    "include/nppi_filtering_functions.h",
+    "include/nppi_geometry_transforms.h",
+    "include/nppi.h",
+    "include/nppi_linear_transforms.h",
+    "include/nppi_morphological_operations.h",
+    "include/nppi_statistics_functions.h",
+    "include/nppi_support_functions.h",
+    "include/nppi_threshold_and_compare_operations.h",
+    "include/npps_arithmetic_and_logical_operations.h",
+    "include/npps_conversion_functions.h",
+    "include/npps_filtering_functions.h",
+    "include/npps.h",
+    "include/npps_initialization.h",
+    "include/npps_statistics_functions.h",
+    "include/npps_support_functions.h",
+    # Note: CUDA 10.0 only
+    # "include/nppversion.h",
+    # TODO: figure out why on a CI machine with CUDA 10.2 it's not present
+    # "include/nvblas.h",
+    "include/nvfunctional",
+    "include/nvgraph.h",
+    "include/nvjpeg.h",
+    "include/nvml.h",
+    "include/nvrtc.h",
+    "include/nvToolsExtCuda.h",
+    "include/nvToolsExtCudaRt.h",
+    "include/nvToolsExt.h",
+    "include/nvToolsExtMeta.h",
+    "include/nvToolsExtSync.h",
+    "include/nvtx3/nvToolsExtCuda.h",
+    "include/nvtx3/nvToolsExtCudaRt.h",
+    "include/nvtx3/nvToolsExt.h",
+    "include/nvtx3/nvToolsExtOpenCL.h",
+    "include/nvtx3/nvToolsExtSync.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCore.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImpl.h",
+    "include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImplSync_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxInitDecls.h",
+    "include/nvtx3/nvtxDetail/nvtxInitDefs.h",
+    "include/nvtx3/nvtxDetail/nvtxInit.h",
+    "include/nvtx3/nvtxDetail/nvtxLinkOnce.h",
+    "include/nvtx3/nvtxDetail/nvtxTypes.h",
+    "include/sm_20_atomic_functions.h",
+    "include/sm_20_atomic_functions.hpp",
+    "include/sm_20_intrinsics.h",
+    "include/sm_20_intrinsics.hpp",
+    "include/sm_30_intrinsics.h",
+    "include/sm_30_intrinsics.hpp",
+    "include/sm_32_atomic_functions.h",
+    "include/sm_32_atomic_functions.hpp",
+    "include/sm_32_intrinsics.h",
+    "include/sm_32_intrinsics.hpp",
+    "include/sm_35_atomic_functions.h",
+    "include/sm_35_intrinsics.h",
+    "include/sm_60_atomic_functions.h",
+    "include/sm_60_atomic_functions.hpp",
+    "include/sm_61_intrinsics.h",
+    "include/sm_61_intrinsics.hpp",
+    # CUDA 10.0 only
+    # "include/sobol_direction_vectors.h",
+    "include/surface_functions.h",
+    "include/surface_functions.hpp",
+    "include/surface_indirect_functions.h",
+    "include/surface_indirect_functions.hpp",
+    "include/surface_types.h",
+    "include/texture_fetch_functions.h",
+    "include/texture_fetch_functions.hpp",
+    "include/texture_indirect_functions.h",
+    "include/texture_indirect_functions.hpp",
+    "include/texture_types.h",
+    "include/vector_functions.h",
+    "include/vector_functions.hpp",
+    "include/vector_types.h",
+]
+
+genrule(
+    name = "cuda-include",
+    outs = CUDA_INCLUDES_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_INCLUDES_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+CUDA_NVVM_FILES = [
+    "nvvm/bin/cicc",
+    "nvvm/include/nvvm.h",
+    "nvvm/lib64/libnvvm.so",
+    "nvvm/lib64/libnvvm.so.3",
+    "nvvm/lib64/libnvvm.so.3.3.0",
+    "nvvm/libdevice/libdevice.10.bc",
+]
+
+genrule(
+    name = "cuda-nvvm",
+    outs = CUDA_NVVM_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_NVVM_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+CUDA_EXTRAS_FILES = [
+    "extras/CUPTI/include/cuda_stdint.h",
+    "extras/CUPTI/include/cupti.h",
+    "extras/CUPTI/include/cupti_activity.h",
+    "extras/CUPTI/include/cupti_callbacks.h",
+    "extras/CUPTI/include/cupti_driver_cbid.h",
+    "extras/CUPTI/include/cupti_events.h",
+    "extras/CUPTI/include/cupti_metrics.h",
+    "extras/CUPTI/include/cupti_nvtx_cbid.h",
+    "extras/CUPTI/include/cupti_result.h",
+    "extras/CUPTI/include/cupti_runtime_cbid.h",
+    "extras/CUPTI/include/cupti_version.h",
+    "extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
+    "extras/CUPTI/include/generated_cuda_meta.h",
+    "extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
+    "extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
+    "extras/CUPTI/include/generated_cudaGL_meta.h",
+    "extras/CUPTI/include/generated_cudaVDPAU_meta.h",
+    "extras/CUPTI/include/generated_nvtx_meta.h",
+    "extras/CUPTI/include/GL/gl.h",
+    "extras/CUPTI/include/GL/glew.h",
+    "extras/CUPTI/include/GL/glext.h",
+    "extras/CUPTI/include/GL/glu.h",
+    "extras/CUPTI/include/GL/glut.h",
+    "extras/CUPTI/include/GL/glx.h",
+    "extras/CUPTI/include/GL/glxext.h",
+    "extras/CUPTI/include/GL/wglew.h",
+    "extras/CUPTI/include/GL/wglext.h",
+    "extras/CUPTI/include/openacc/cupti_openacc.h",
+]
+
+genrule(
+    name = "cuda-extras",
+    outs = CUDA_EXTRAS_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_EXTRAS_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+genrule(
+    name = "cudnn-include",
+    outs = [
+        "include/cudnn.h",
+    ],
+    cmd = """
+        ln -s /usr/include/cudnn.h $(@D)/cudnn.h""",
+    local = True,
+    tags = ["no-cache"],
+)
+
diff --git a/tools/config/BUILD b/tools/config/BUILD
index a8f9d04..ba13eda 100644
--- a/tools/config/BUILD
+++ b/tools/config/BUILD
@@ -13,7 +13,6 @@ selects.config_setting_group(
     name = "cuda_enabled_and_capable",
     match_all = [
         ":cuda",
-        "//tools/toolchain:is_cuda_capable",
     ],
 )
 
diff --git a/tools/rules/workspace.bzl b/tools/rules/workspace.bzl
index 59e12e8..34317be 100644
--- a/tools/rules/workspace.bzl
+++ b/tools/rules/workspace.bzl
@@ -27,3 +27,28 @@ pkg_tar(name = "content", srcs = glob(["**"]))
         path = path,
     )
     _patched_rule(name = name, **kwargs)
+
+def _new_empty_repository_impl(repo_ctx):
+    """Write the repo's BUILD file from whichever of build_file / build_file_content is set."""
+    label = repo_ctx.attr.build_file
+    inline = repo_ctx.attr.build_file_content
+    if bool(label) == bool(inline):
+        fail("Exactly one of 'build_file' or 'build_file_content' is required")
+    if inline:
+        repo_ctx.file("BUILD", inline)
+    else:
+        repo_ctx.template("BUILD", label, {})
+
+# Repository rule that materializes a repository containing only a BUILD file.
+new_empty_repository = repository_rule(
+    attrs = {
+        "build_file": attr.label(allow_files = True),
+        "build_file_content": attr.string(),
+    },
+    doc = """Create an empty repository with the supplied BUILD file.
+
+This is mostly useful to create wrappers for specific targets that we want
+to be used with the '@' syntax.
+""",
+    implementation = _new_empty_repository_impl,
+)
-- 
2.7.4