# torch/csrc/{autgrad,jit}/generated. In fbcode, this distinction is
# not currently relevant so they are combined into one list.
from __future__ import absolute_import, division, print_function, unicode_literals
+load("@bazel_skylib//lib:new_sets.bzl", "sets")
GENERATED_CPP = [
"python_variable_methods.cpp",
]
-# copied from https://github.com/pytorch/pytorch/blob/master/tools/cpp_build/libtorch/CMakeLists.txt
-torch_sources_no_python_default = [
+# copied from https://github.com/pytorch/pytorch/blob/master/tools/cpp_build/torch/CMakeLists.txt
+libtorch_sources = [
":generate-code=Functions.cpp",
":generate-code=register_aten_ops_0.cpp",
":generate-code=register_aten_ops_1.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/functions/accumulate_grad.cpp",
"torch/csrc/autograd/functions/basic_ops.cpp",
- "torch/csrc/autograd/functions/comm.cpp",
"torch/csrc/autograd/functions/tensor.cpp",
"torch/csrc/autograd/functions/utils.cpp",
"torch/csrc/autograd/grad_mode.cpp",
"torch/csrc/jit/fuser/interface.cpp",
]
+libtorch_cuda_sources = [
+ "torch/csrc/cuda/comm.cpp",
+ "torch/csrc/cuda/nccl.cpp",
+ "torch/csrc/jit/fuser/cuda/fused_kernel.cpp",
+ "torch/csrc/autograd/profiler_cuda.cpp",
+ "torch/csrc/autograd/functions/comm.cpp"
+]
-def torch_vars():
- r = {}
- # We start torch_sources with all cpp files, and exclude some.
- # This is a much better approach than listing all of them manually because
- # the number of excluded files is small and doesn"t change very frequently
- r["torch_sources"] = (
- native.glob(
- ["torch/csrc/**/*.cpp"],
- exclude=[
- # remove anything that has "generic" in it"s path
- "torch/csrc/**/generic/**/*.cpp",
- # distributed only uses Module.cpp
- # so remove all other files and just include that
- "torch/csrc/distributed/**/*.cpp",
- ],
- )
- + [
- "torch/csrc/distributed/Module.cpp",
- "torch/csrc/distributed/c10d/init.cpp",
- "torch/csrc/distributed/c10d/ddp.cpp",
- ]
- + [":generate-code=" + x for x in GENERATED_CPP]
- )
-
- r["torch_sources_no_python"] = (
- torch_sources_no_python_default
- + ["torch/csrc/cuda/comm.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/jit/fuser/cuda/fused_kernel.cpp"]
- )
- r["torch_sources_no_python_cpu"] = torch_sources_no_python_default
+def add_torch_libs():
+ r = {}
+ # We start torch_python_sources with all cpp files, and exclude some
+ # including the files already contained in the torch and cuda bindings
+ globbed_sources = (native.glob(
+ ["torch/csrc/**/*.cpp"],
+ exclude=[
+ # remove anything that has "generic" in it"s path
+ "torch/csrc/**/generic/**/*.cpp",
+ # distributed only uses Module.cpp
+ # so remove all other files and just include that
+ "torch/csrc/distributed/**/*.cpp",
+ ],
+ ) + [
+ "torch/csrc/distributed/Module.cpp",
+ "torch/csrc/distributed/c10d/init.cpp",
+ "torch/csrc/distributed/c10d/ddp.cpp",
+ ] + [":generate-code=" + x for x in GENERATED_CPP])
+ libtorch_python_sources = sets.to_list(sets.difference(
+ sets.make(globbed_sources),
+ sets.make(libtorch_sources + libtorch_cuda_sources),
+ ))
- r["torch_csrc_flags"] = {
+ common_flags = {
"compiler_flags": [
"-D_THP_CORE",
"-DUSE_C10D",
],
}
- r["torch_csrc_flags_cpu"] = dict(r["torch_csrc_flags"])
+ cpp_library(
+ name="libtorch",
+ srcs=libtorch_sources,
+ link_whole=True,
+ deps=[
+ ":generated-autograd-headers",
+ ":generated-autograd-headers-bare",
+ ":generated-jit-headers",
+ "//caffe2/aten:ATen-cpu",
+ "//caffe2/caffe2:caffe2_cpu",
+ "//caffe2/torch/lib/libshm:libshm",
+ ],
+ external_deps=[
+ ("nanopb", None, "protobuf-nanopb"),
+ ("protobuf", None),
+ ],
+ **common_flags
+ )
+
+ cpp_library(
+ name="libtorch_cuda",
+ srcs=libtorch_cuda_sources,
+ link_whole=True,
+ propagated_pp_flags=[
+ "-DUSE_CUDA",
+ ],
+ deps=[
+ ":generated-autograd-headers",
+ ":generated-autograd-headers-bare",
+ ":generated-jit-headers",
+ ":libtorch",
+ "//caffe2/aten:ATen",
+ "//caffe2/aten:generated-aten-headers-cuda",
+ "//caffe2/caffe2:caffe2_cpu",
+ "//caffe2/torch/lib/libshm:libshm",
+ ],
+ external_deps=[
+ ("cudnn", "7.1.2", "cudnn-lazy"),
+ ("nccl", "2.1.15", "nccl-lazy"),
+ ("cuda", None, "nvToolsExt-lazy"),
+ ("cuda", None, "nvrtc-lazy"),
+ ("cuda", None, "nvrtc-builtins-lazy"),
+ ],
+ **common_flags
+ )
+
+ cpp_python_extension(
+ name="_C",
+ srcs=libtorch_python_sources,
+ base_module="torch",
+ deps=[
+ ":libtorch_cuda",
+ ":thnn",
+ ":torch-lib-headers",
+ "//caffe2/torch/lib/THD:THD",
+ "//caffe2/torch/lib/c10d:c10d",
+ "//caffe2/torch/lib/libshm:libshm",
+ ],
+ external_deps=[
+ ("numpy", None, "cpp"),
+ ("pybind11", None),
+ ],
+ **common_flags
+ )
- r["torch_csrc_flags_cpu"]["preprocessor_flags"] = [
- "-Icaffe2",
- "-Icaffe2/torch/csrc/api/include",
- "-Icaffe2/torch/csrc",
- "-Icaffe2/torch/csrc/nn",
- "-Icaffe2/torch/lib",
- ]
return r
${TORCH_SRC_DIR}/csrc/autograd/function.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp
- ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp
${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp
${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp
- ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp
${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp
${TORCH_SRC_DIR}/csrc/jit/export.cpp
${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp
endif()
endif ()
+if (USE_CUDA)
+ list(APPEND TORCH_SRCS
+ ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp
+ ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp
+ ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp
+ )
+endif()
+
if (NOT NO_API)
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp
#include <torch/types.h>
#include <torch/csrc/autograd/functions/comm.h>
+#ifdef USE_CUDA
#include <torch/csrc/cuda/comm.h>
+#endif
#include <torch/csrc/utils/functional.h>
#include <ATen/Device.h>
-#ifdef USE_CUDA
-
#include <torch/csrc/autograd/functions/comm.h>
#include <torch/csrc/autograd/function.h>
unsqueeze_scalars_(unsqueeze_scalars) {}
variable_list Scatter::apply(variable_list&& inputs) {
-#ifdef USE_CUDA
AT_ASSERT(inputs.size() == 1);
auto& input = inputs.front();
set_history(variables, grad_fn);
return variables;
-#else
- AT_ERROR("Scatter is only supported in CUDA environments");
-#endif
}
Gather::Gather(const at::Device& destination_device, int64_t dim)
: destination_device_(destination_device), dim_(dim) {}
variable_list Gather::apply(variable_list&& inputs) {
-#ifdef USE_CUDA
bool all_are_zero_dim = true;
for (const auto& input : inputs) {
AT_CHECK(
auto variable = torch::cuda::gather(tensors, dim_, destination_index);
set_history(variable, grad_fn);
return {variable};
-#else
- AT_ERROR("Gather is only supported in CUDA environments");
-#endif
}
} // namespace autograd
} // namespace torch
-
-#endif
#pragma once
-#ifdef USE_CUDA
-
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
} // namespace autograd
} // namespace torch
-
-#endif
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/autograd/function.h>
-#ifdef USE_CUDA
-#include <c10/cuda/CUDAGuard.h>
-#endif
-
#include <sstream>
namespace torch { namespace autograd { namespace profiler {
+CUDAStubs default_stubs;
+constexpr CUDAStubs* default_stubs_addr = &default_stubs;
+// constant initialization, so it is guarenteed to be initialized before
+// static initialization calls which may invoke registerCUDAMethods
+static CUDAStubs* cuda_stubs = default_stubs_addr;
+
+TORCH_API void registerCUDAMethods(CUDAStubs* stubs) {
+ cuda_stubs = stubs;
+}
+
ProfilerState state = ProfilerState::Disabled;
uint16_t next_thread_id = 0;
std::mutex all_event_lists_mutex;
return;
}
if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
- nvtxMarkA(name.c_str());
-#else
- throw std::logic_error(
- "mark called with NVTX tracing, but compiled without CUDA");
-#endif
+ cuda_stubs->nvtxMarkA(name.c_str());
} else {
getEventList().record(
EventKind::Mark,
return;
}
if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
if(sequence_nr >= 0) {
std::stringstream s;
s << name << msg << sequence_nr;
- nvtxRangePushA(s.str().c_str());
+ cuda_stubs->nvtxRangePushA(s.str().c_str());
} else {
- nvtxRangePushA(c_str(name));
+ cuda_stubs->nvtxRangePushA(c_str(name));
}
-#else
- throw std::logic_error(
- "pushRange called with NVTX tracing, but compiled without CUDA");
-#endif
} else {
getEventList().record(
EventKind::PushRange,
return;
}
if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
- nvtxRangePop();
-#else
- throw std::logic_error(
- "popRange called with NVTX tracing, but compiled without CUDA");
-#endif
+ cuda_stubs->nvtxRangePop();
} else {
getEventList().record(
EventKind::PopRange,
RecordFunction::RecordFunction(Function* fn) {
// typeid(*fn).name() would avoid an additional string allocation.
- // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
+ // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
// (Python-side) custom autograd function backward() methods to have the same name,
// because they route through the same C++ side class.
// fn->name() ensures that nvtx annotations for custom function backward() methods
pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
}
-#ifdef USE_CUDA
-static void onEachDevice(std::function<void(int)> op) {
- at::cuda::OptionalCUDAGuard device_guard;
- int count;
- TORCH_CUDA_CHECK(cudaGetDeviceCount(&count));
- for(int i = 0; i < count; i++) {
- device_guard.set_index(i);
- op(i);
- }
-}
-#endif
-
void enableProfiler(ProfilerState new_state) {
AT_ASSERT(new_state != ProfilerState::Disabled);
-#ifndef USE_CUDA
- if (new_state == ProfilerState::NVTX)
+ if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
-#endif
if (state != ProfilerState::Disabled && new_state != state) {
throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
}
state = new_state;
-#ifdef USE_CUDA
if(state == ProfilerState::CUDA) {
// event recording appears to have some startup overhead, so we need to
// to generate some dummy events first before recording syncrhonization events
for(int i = 0; i < 5; i++) {
- onEachDevice([](int d) {
+ cuda_stubs->onEachDevice([](int d) {
mark("__cuda_startup");
- cudaDeviceSynchronize();
+ cuda_stubs->synchronize();
});
}
// cuda events must be on the same device, so we need a start event recorded
// for each gpu. we then use this event to synchronize time on the GPU
// with the CPU clock.
- onEachDevice([](int d) {
+ cuda_stubs->onEachDevice([](int d) {
mark("__cuda_start_event");
});
}
-#endif
mark("__start_profile", false);
}
}
}
+void Event::record(bool record_cuda) {
+ if (record_cuda) {
+ cuda_stubs->record(&device_, &event, &cpu_ns_);
+ return;
+ }
+ cpu_ns_ = getTime();
+}
+
+double Event::cuda_elapsed_us(const Event & e) {
+ if(!e.has_cuda() || !has_cuda()) {
+ throw std::logic_error("Events were not recorded for CUDA");
+ }
+ if(e.device() != device()) {
+ throw std::logic_error("Events are not on the same device");
+ }
+ return cuda_stubs->elapsed(event, e.event);
+}
+
+CUDAStubs::~CUDAStubs() = default;
+
}}}
#pragma once
-#ifdef USE_CUDA
-#include <nvToolsExt.h>
-#endif
#include <thread>
#include <iostream>
#include <mutex>
#include <tuple>
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
-#include <torch/csrc/cuda/cuda_check.h>
-#ifdef USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#endif
#ifndef _WIN32
#include <ctime>
#endif
+typedef struct CUevent_st* CUDAEventStub;
+
namespace torch { namespace autograd {
struct Function;
namespace profiler {
+struct TORCH_API CUDAStubs {
+ virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
+ fail();
+ }
+ virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
+ fail();
+ return 0.f;
+ }
+ virtual void nvtxMarkA(const char* name) {
+ fail();
+ }
+ virtual void nvtxRangePushA(const char* name) {
+ fail();
+ }
+ virtual void nvtxRangePop() {
+ fail();
+ }
+ virtual bool enabled() {
+ return false;
+ }
+ virtual void onEachDevice(std::function<void(int)> op) {
+ fail();
+ }
+ virtual void synchronize() {
+ fail();
+ }
+ virtual ~CUDAStubs();
+
+private:
+ void fail() {
+ AT_ERROR("CUDA used in profiler but not enabled.");
+ }
+};
+
+TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
+
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
return ((a + b - 1) / b) * b;
}
PopRange
};
-struct Event final {
+struct TORCH_API Event final {
Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
: owned_name_(new std::string(std::move(name)))
, name_ptr_(owned_name_->c_str())
, kind_(kind)
, thread_id_(thread_id) { record(record_cuda); }
- void record(bool record_cuda) {
-#ifdef USE_CUDA
- if (record_cuda) {
- TORCH_CUDA_CHECK(cudaGetDevice(&device_));
- TORCH_CUDA_CHECK(cudaEventCreate(&event));
- auto stream = at::cuda::getCurrentCUDAStream();
- cpu_ns_ = getTime();
- TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
- return;
- }
-#endif
- cpu_ns_ = getTime();
- }
+ void record(bool record_cuda);
std::string kind() const {
switch(kind_) {
case EventKind::Mark: return "mark";
double cpu_elapsed_us(const Event & e) {
return (e.cpu_ns_ - cpu_ns_)/(1000.0);
}
- double cuda_elapsed_us(const Event & e) {
-#ifdef USE_CUDA
- if(!e.has_cuda() || !has_cuda()) {
- throw std::logic_error("Events were not recorded for CUDA");
- }
- if(e.device() != device()) {
- throw std::logic_error("Events are not on the same device");
- }
- TORCH_CUDA_CHECK(cudaEventSynchronize(event));
- TORCH_CUDA_CHECK(cudaEventSynchronize(e.event));
- float ms;
- TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, e.event));
- return ms*1000.0;
-#else
- throw std::logic_error("CUDA not enabled");
-#endif
- }
+ double cuda_elapsed_us(const Event & e);
bool has_cuda() const {
-#ifdef USE_CUDA
return event != nullptr;
-#else
- return false;
-#endif
}
int device() const {
return device_;
EventKind kind_;
uint16_t thread_id_;
int device_ = -1;
-#ifdef USE_CUDA
- cudaEvent_t event = nullptr;
-#endif
+ struct CUevent_st* event = nullptr;
};
// a linked-list of fixed sized vectors, to avoid
--- /dev/null
+#include <torch/csrc/autograd/profiler.h>
+#include <torch/csrc/cuda/cuda_check.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <nvToolsExt.h>
+
+#include <sstream>
+
+namespace torch { namespace autograd { namespace profiler {
+
+namespace {
+
+struct CUDAMethods : public CUDAStubs {
+ void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) override {
+ TORCH_CUDA_CHECK(cudaGetDevice(device));
+ TORCH_CUDA_CHECK(cudaEventCreate(event));
+ auto stream = at::cuda::getCurrentCUDAStream();
+ *cpu_ns = getTime();
+ TORCH_CUDA_CHECK(cudaEventRecord(*event, stream));
+ }
+ float elapsed(CUDAEventStub event, CUDAEventStub event2) override {
+ TORCH_CUDA_CHECK(cudaEventSynchronize(event));
+ TORCH_CUDA_CHECK(cudaEventSynchronize(event2));
+ float ms;
+ TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, event2));
+ return ms*1000.0;
+ }
+ void nvtxMarkA(const char* name) override {
+ ::nvtxMark(name);
+ }
+ void nvtxRangePushA(const char* name) override {
+ ::nvtxRangePushA(name);
+ }
+ void nvtxRangePop() override {
+ ::nvtxRangePop();
+ }
+ void onEachDevice(std::function<void(int)> op) override {
+ at::cuda::OptionalCUDAGuard device_guard;
+ int count;
+ TORCH_CUDA_CHECK(cudaGetDeviceCount(&count));
+ for(int i = 0; i < count; i++) {
+ device_guard.set_index(i);
+ op(i);
+ }
+ }
+ void synchronize() override {
+ cudaDeviceSynchronize();
+ }
+ bool enabled() override {
+ return true;
+ }
+
+};
+
+struct RegisterCUDAMethods {
+ RegisterCUDAMethods() {
+ static CUDAMethods methods;
+ registerCUDAMethods(&methods);
+ }
+};
+RegisterCUDAMethods reg;
+
+} // namespaces
+} // namespace profiler
+} // namespace autograd
+} // namespace torch
#include <torch/csrc/cuda/comm.h>
-#ifdef USE_CUDA
-
#include <torch/csrc/cuda/device_set.h>
#include <torch/csrc/utils/tensor_flatten.h>
return result;
}
}} // namespace torch::cuda
-
-#endif
#pragma once
-#ifdef USE_CUDA
-
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Optional.h>
int64_t dim,
c10::optional<int32_t> destination_index);
}}
-
-#endif
#pragma once
-#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
#define TORCH_CUDA_CHECK(result) ::torch::cudaCheck(result,__FILE__,__LINE__);
}
-
-#endif