Remove cuda from autograd profiler (#15898)
author     Zachary DeVito <zdevito@fb.com>
Wed, 16 Jan 2019 00:25:28 +0000 (16:25 -0800)
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 16 Jan 2019 00:43:11 +0000 (16:43 -0800)
Summary:
This puts stubs in the autograd profiler for its uses of CUDA APIs, allowing the CUDA parts of libtorch to be linked separately from the CPU parts.

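The CUDA calls go through a stub interface that the CUDA library fills in at load time. Roughly, the pattern is the following (a minimal sketch with hypothetical names such as CudaHooks; the actual interface added here is CUDAStubs/registerCUDAMethods in torch/csrc/autograd/profiler.h and profiler_cuda.cpp):

    // CPU-only library: a virtual interface whose default methods fail loudly.
    #include <stdexcept>

    struct CudaHooks {
      virtual void mark(const char* /*name*/) {
        throw std::runtime_error("CUDA used in profiler but not enabled.");
      }
      virtual ~CudaHooks() = default;
    };

    // Constant-initialized fallback, valid before any other static initializer runs.
    static CudaHooks default_hooks;
    static CudaHooks* hooks = &default_hooks;

    void registerCudaHooks(CudaHooks* h) { hooks = h; }
    void profilerMark(const char* name) { hooks->mark(name); }  // CPU code only calls through the pointer

    // Separately linked CUDA library: real implementation, registered by a
    // static object's constructor when (and only when) this library is linked in.
    struct RealCudaHooks : CudaHooks {
      void mark(const char* /*name*/) override { /* would call nvtxMarkA(name) */ }
    };
    static struct RegisterHooks {
      RegisterHooks() {
        static RealCudaHooks real;
        registerCudaHooks(&real);
      }
    } register_hooks;

Because the default stub is constant-initialized, the registration from the CUDA library can safely run during static initialization, which is the point of the comment added to profiler.cpp below.
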
This also updates the Buck build accordingly.

Previous:

For GPU builds:
  _C -> csrc -> caffe2
For CPU builds:
  _C -> csrc-cpu -> caffe2

Now:

For GPU builds:
  _C -> libtorch_cuda -> (libtorch -> caffe2, for the CPU parts)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/15898

Reviewed By: ailzhang

Differential Revision: D13617991

Pulled By: zdevito

fbshipit-source-id: 6d84a50bb356a54b4217f93219902755601b00e1

tools/build_variables.py
torch/CMakeLists.txt
torch/csrc/api/include/torch/nn/parallel/data_parallel.h
torch/csrc/autograd/functions/comm.cpp
torch/csrc/autograd/functions/comm.h
torch/csrc/autograd/profiler.cpp
torch/csrc/autograd/profiler.h
torch/csrc/autograd/profiler_cuda.cpp [new file with mode: 0644]
torch/csrc/cuda/comm.cpp
torch/csrc/cuda/comm.h
torch/csrc/cuda/cuda_check.h

diff --git a/tools/build_variables.py b/tools/build_variables.py
index b6511c3..a24e795 100644
@@ -2,6 +2,7 @@
 # torch/csrc/{autograd,jit}/generated. In fbcode, this distinction is
 # not currently relevant so they are combined into one list.
 from __future__ import absolute_import, division, print_function, unicode_literals
+load("@bazel_skylib//lib:new_sets.bzl", "sets")
 
 
 GENERATED_CPP = [
@@ -22,8 +23,8 @@ GENERATED_CPP = [
     "python_variable_methods.cpp",
 ]
 
-# copied from https://github.com/pytorch/pytorch/blob/master/tools/cpp_build/libtorch/CMakeLists.txt
-torch_sources_no_python_default = [
+# copied from https://github.com/pytorch/pytorch/blob/master/tools/cpp_build/torch/CMakeLists.txt
+libtorch_sources = [
     ":generate-code=Functions.cpp",
     ":generate-code=register_aten_ops_0.cpp",
     ":generate-code=register_aten_ops_1.cpp",
@@ -39,7 +40,6 @@ torch_sources_no_python_default = [
     "torch/csrc/autograd/function.cpp",
     "torch/csrc/autograd/functions/accumulate_grad.cpp",
     "torch/csrc/autograd/functions/basic_ops.cpp",
-    "torch/csrc/autograd/functions/comm.cpp",
     "torch/csrc/autograd/functions/tensor.cpp",
     "torch/csrc/autograd/functions/utils.cpp",
     "torch/csrc/autograd/grad_mode.cpp",
@@ -110,39 +110,39 @@ torch_sources_no_python_default = [
     "torch/csrc/jit/fuser/interface.cpp",
 ]
 
+libtorch_cuda_sources = [
+    "torch/csrc/cuda/comm.cpp",
+    "torch/csrc/cuda/nccl.cpp",
+    "torch/csrc/jit/fuser/cuda/fused_kernel.cpp",
+    "torch/csrc/autograd/profiler_cuda.cpp",
+    "torch/csrc/autograd/functions/comm.cpp"
+]
 
-def torch_vars():
-    r = {}
-    # We start torch_sources with all cpp files, and exclude some.
-    # This is a much better approach than listing all of them manually because
-    # the number of excluded files is small and doesn"t change very frequently
-    r["torch_sources"] = (
-        native.glob(
-            ["torch/csrc/**/*.cpp"],
-            exclude=[
-                # remove anything that has "generic" in it"s path
-                "torch/csrc/**/generic/**/*.cpp",
-                # distributed only uses Module.cpp
-                # so remove all other files and just include that
-                "torch/csrc/distributed/**/*.cpp",
-            ],
-        )
-        + [
-            "torch/csrc/distributed/Module.cpp",
-            "torch/csrc/distributed/c10d/init.cpp",
-            "torch/csrc/distributed/c10d/ddp.cpp",
-        ]
-        + [":generate-code=" + x for x in GENERATED_CPP]
-    )
-
-    r["torch_sources_no_python"] = (
-        torch_sources_no_python_default
-        + ["torch/csrc/cuda/comm.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/jit/fuser/cuda/fused_kernel.cpp"]
-    )
 
-    r["torch_sources_no_python_cpu"] = torch_sources_no_python_default
+def add_torch_libs():
+    r = {}
+    # We start torch_python_sources with all cpp files, and exclude some
+    # including the files already contained in the torch and cuda bindings
+    globbed_sources = (native.glob(
+        ["torch/csrc/**/*.cpp"],
+        exclude=[
+            # remove anything that has "generic" in its path
+            "torch/csrc/**/generic/**/*.cpp",
+            # distributed only uses Module.cpp
+            # so remove all other files and just include that
+            "torch/csrc/distributed/**/*.cpp",
+        ],
+    ) + [
+        "torch/csrc/distributed/Module.cpp",
+        "torch/csrc/distributed/c10d/init.cpp",
+        "torch/csrc/distributed/c10d/ddp.cpp",
+    ] + [":generate-code=" + x for x in GENERATED_CPP])
+    libtorch_python_sources = sets.to_list(sets.difference(
+        sets.make(globbed_sources),
+        sets.make(libtorch_sources + libtorch_cuda_sources),
+    ))
 
-    r["torch_csrc_flags"] = {
+    common_flags = {
         "compiler_flags": [
             "-D_THP_CORE",
             "-DUSE_C10D",
@@ -180,13 +180,69 @@ def torch_vars():
         ],
     }
 
-    r["torch_csrc_flags_cpu"] = dict(r["torch_csrc_flags"])
+    cpp_library(
+        name="libtorch",
+        srcs=libtorch_sources,
+        link_whole=True,
+        deps=[
+            ":generated-autograd-headers",
+            ":generated-autograd-headers-bare",
+            ":generated-jit-headers",
+            "//caffe2/aten:ATen-cpu",
+            "//caffe2/caffe2:caffe2_cpu",
+            "//caffe2/torch/lib/libshm:libshm",
+        ],
+        external_deps=[
+            ("nanopb", None, "protobuf-nanopb"),
+            ("protobuf", None),
+        ],
+        **common_flags
+    )
+
+    cpp_library(
+        name="libtorch_cuda",
+        srcs=libtorch_cuda_sources,
+        link_whole=True,
+        propagated_pp_flags=[
+            "-DUSE_CUDA",
+        ],
+        deps=[
+            ":generated-autograd-headers",
+            ":generated-autograd-headers-bare",
+            ":generated-jit-headers",
+            ":libtorch",
+            "//caffe2/aten:ATen",
+            "//caffe2/aten:generated-aten-headers-cuda",
+            "//caffe2/caffe2:caffe2_cpu",
+            "//caffe2/torch/lib/libshm:libshm",
+        ],
+        external_deps=[
+            ("cudnn", "7.1.2", "cudnn-lazy"),
+            ("nccl", "2.1.15", "nccl-lazy"),
+            ("cuda", None, "nvToolsExt-lazy"),
+            ("cuda", None, "nvrtc-lazy"),
+            ("cuda", None, "nvrtc-builtins-lazy"),
+        ],
+        **common_flags
+    )
+
+    cpp_python_extension(
+        name="_C",
+        srcs=libtorch_python_sources,
+        base_module="torch",
+        deps=[
+            ":libtorch_cuda",
+            ":thnn",
+            ":torch-lib-headers",
+            "//caffe2/torch/lib/THD:THD",
+            "//caffe2/torch/lib/c10d:c10d",
+            "//caffe2/torch/lib/libshm:libshm",
+        ],
+        external_deps=[
+            ("numpy", None, "cpp"),
+            ("pybind11", None),
+        ],
+        **common_flags
+    )
 
-    r["torch_csrc_flags_cpu"]["preprocessor_flags"] = [
-        "-Icaffe2",
-        "-Icaffe2/torch/csrc/api/include",
-        "-Icaffe2/torch/csrc",
-        "-Icaffe2/torch/csrc/nn",
-        "-Icaffe2/torch/lib",
-    ]
     return r
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 72f3431..02d6abc 100644
@@ -129,7 +129,6 @@ set(TORCH_SRCS
   ${TORCH_SRC_DIR}/csrc/autograd/function.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp
-  ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp
@@ -144,7 +143,6 @@ set(TORCH_SRCS
   ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp
-  ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp
   ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp
   ${TORCH_SRC_DIR}/csrc/jit/export.cpp
   ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp
@@ -227,6 +225,14 @@ else ()
   endif()
 endif ()
 
+if (USE_CUDA)
+  list(APPEND TORCH_SRCS
+    ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp
+    ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp
+    ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp
+  )
+endif()
+
 if (NOT NO_API)
   list(APPEND TORCH_SRCS
     ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp
diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h
index ea23398..ebdb174 100644
@@ -6,7 +6,9 @@
 #include <torch/types.h>
 
 #include <torch/csrc/autograd/functions/comm.h>
+#ifdef USE_CUDA
 #include <torch/csrc/cuda/comm.h>
+#endif
 #include <torch/csrc/utils/functional.h>
 
 #include <ATen/Device.h>
diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp
index e604859..3ae4faf 100644
@@ -1,5 +1,3 @@
-#ifdef USE_CUDA
-
 #include <torch/csrc/autograd/functions/comm.h>
 
 #include <torch/csrc/autograd/function.h>
@@ -31,7 +29,6 @@ Scatter::Scatter(
       unsqueeze_scalars_(unsqueeze_scalars) {}
 
 variable_list Scatter::apply(variable_list&& inputs) {
-#ifdef USE_CUDA
   AT_ASSERT(inputs.size() == 1);
   auto& input = inputs.front();
 
@@ -63,16 +60,12 @@ variable_list Scatter::apply(variable_list&& inputs) {
   set_history(variables, grad_fn);
 
   return variables;
-#else
-  AT_ERROR("Scatter is only supported in CUDA environments");
-#endif
 }
 
 Gather::Gather(const at::Device& destination_device, int64_t dim)
     : destination_device_(destination_device), dim_(dim) {}
 
 variable_list Gather::apply(variable_list&& inputs) {
-#ifdef USE_CUDA
   bool all_are_zero_dim = true;
   for (const auto& input : inputs) {
     AT_CHECK(
@@ -125,12 +118,7 @@ variable_list Gather::apply(variable_list&& inputs) {
   auto variable = torch::cuda::gather(tensors, dim_, destination_index);
   set_history(variable, grad_fn);
   return {variable};
-#else
-  AT_ERROR("Gather is only supported in CUDA environments");
-#endif
 }
 
 } // namespace autograd
 } // namespace torch
-
-#endif
diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h
index e806546..552c085 100644
@@ -1,7 +1,5 @@
 #pragma once
 
-#ifdef USE_CUDA
-
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/variable.h>
 #include <torch/csrc/WindowsTorchApiMacro.h>
@@ -44,5 +42,3 @@ struct TORCH_API Gather : public Function {
 
 } // namespace autograd
 } // namespace torch
-
-#endif
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 25b2780..9a6550c 100644
@@ -1,14 +1,20 @@
 #include <torch/csrc/autograd/profiler.h>
 #include <torch/csrc/autograd/function.h>
 
-#ifdef USE_CUDA
-#include <c10/cuda/CUDAGuard.h>
-#endif
-
 #include <sstream>
 
 namespace torch { namespace autograd { namespace profiler {
 
+CUDAStubs default_stubs;
+constexpr CUDAStubs* default_stubs_addr = &default_stubs;
+// constant initialization, so it is guaranteed to be initialized before
+// static initialization calls which may invoke registerCUDAMethods
+static CUDAStubs* cuda_stubs = default_stubs_addr;
+
+TORCH_API void registerCUDAMethods(CUDAStubs* stubs) {
+  cuda_stubs = stubs;
+}
+
 ProfilerState state = ProfilerState::Disabled;
 uint16_t next_thread_id = 0;
 std::mutex all_event_lists_mutex;
@@ -31,12 +37,7 @@ void mark(std::string name, bool include_cuda /* = true */) {
     return;
   }
   if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
-    nvtxMarkA(name.c_str());
-#else
-    throw std::logic_error(
-        "mark called with NVTX tracing, but compiled without CUDA");
-#endif
+    cuda_stubs->nvtxMarkA(name.c_str());
   } else {
     getEventList().record(
         EventKind::Mark,
@@ -56,18 +57,13 @@ void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
     return;
   }
   if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
     if(sequence_nr >= 0) {
       std::stringstream s;
       s << name << msg << sequence_nr;
-      nvtxRangePushA(s.str().c_str());
+      cuda_stubs->nvtxRangePushA(s.str().c_str());
     } else {
-      nvtxRangePushA(c_str(name));
+      cuda_stubs->nvtxRangePushA(c_str(name));
     }
-#else
-    throw std::logic_error(
-        "pushRange called with NVTX tracing, but compiled without CUDA");
-#endif
   } else {
     getEventList().record(
         EventKind::PushRange,
@@ -86,12 +82,7 @@ void popRange() {
     return;
   }
   if (state == ProfilerState::NVTX) {
-#ifdef USE_CUDA
-    nvtxRangePop();
-#else
-    throw std::logic_error(
-        "popRange called with NVTX tracing, but compiled without CUDA");
-#endif
+    cuda_stubs->nvtxRangePop();
   } else {
     getEventList().record(
         EventKind::PopRange,
@@ -103,7 +94,7 @@ void popRange() {
 
 RecordFunction::RecordFunction(Function* fn) {
   // typeid(*fn).name() would avoid an additional string allocation.
-  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined 
+  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
   // (Python-side) custom autograd function backward() methods to have the same name,
   // because they route through the same C++ side class.
   // fn->name() ensures that nvtx annotations for custom function backward() methods
@@ -124,48 +115,32 @@ RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
   pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
 }
 
-#ifdef USE_CUDA
-static void onEachDevice(std::function<void(int)> op) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  int count;
-  TORCH_CUDA_CHECK(cudaGetDeviceCount(&count));
-  for(int i = 0; i < count; i++) {
-    device_guard.set_index(i);
-    op(i);
-  }
-}
-#endif
-
 void enableProfiler(ProfilerState new_state) {
   AT_ASSERT(new_state != ProfilerState::Disabled);
-#ifndef USE_CUDA
-  if (new_state == ProfilerState::NVTX)
+  if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
     throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
-#endif
   if (state != ProfilerState::Disabled && new_state != state) {
       throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
   }
   state = new_state;
 
-#ifdef USE_CUDA
   if(state == ProfilerState::CUDA) {
     // event recording appears to have some startup overhead, so we need to
     // generate some dummy events first before recording synchronization events
     for(int i = 0; i < 5; i++) {
-      onEachDevice([](int d) {
+      cuda_stubs->onEachDevice([](int d) {
           mark("__cuda_startup");
-          cudaDeviceSynchronize();
+          cuda_stubs->synchronize();
       });
     }
 
     // cuda events must be on the same device, so we need a start event recorded
     // for each gpu. we then use this event to synchronize time on the GPU
     // with the CPU clock.
-    onEachDevice([](int d) {
+    cuda_stubs->onEachDevice([](int d) {
         mark("__cuda_start_event");
     });
   }
-#endif
   mark("__start_profile", false);
 }
 
@@ -197,4 +172,24 @@ thread_event_lists disableProfiler() {
   }
 }
 
+void Event::record(bool record_cuda) {
+  if (record_cuda) {
+    cuda_stubs->record(&device_, &event, &cpu_ns_);
+    return;
+  }
+  cpu_ns_ = getTime();
+}
+
+double Event::cuda_elapsed_us(const Event & e) {
+  if(!e.has_cuda() || !has_cuda()) {
+    throw std::logic_error("Events were not recorded for CUDA");
+  }
+  if(e.device() != device()) {
+    throw std::logic_error("Events are not on the same device");
+  }
+  return cuda_stubs->elapsed(event, e.event);
+}
+
+CUDAStubs::~CUDAStubs() = default;
+
 }}}
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 0750cdf..e262f08 100644
@@ -1,8 +1,5 @@
 #pragma once
 
-#ifdef USE_CUDA
-#include <nvToolsExt.h>
-#endif
 #include <thread>
 #include <iostream>
 #include <mutex>
 #include <tuple>
 #include <ATen/ATen.h>
 #include <torch/csrc/WindowsTorchApiMacro.h>
-#include <torch/csrc/cuda/cuda_check.h>
-#ifdef USE_CUDA
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#endif
 #ifndef _WIN32
 #include <ctime>
 #endif
 
+typedef struct CUevent_st* CUDAEventStub;
+
 namespace torch { namespace autograd {
 
 struct Function;
 
 namespace profiler {
 
+struct TORCH_API CUDAStubs {
+  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
+    fail();
+  }
+  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
+    fail();
+    return 0.f;
+  }
+  virtual void nvtxMarkA(const char* name) {
+    fail();
+  }
+  virtual void nvtxRangePushA(const char* name) {
+    fail();
+  }
+  virtual void nvtxRangePop() {
+    fail();
+  }
+  virtual bool enabled() {
+    return false;
+  }
+  virtual void onEachDevice(std::function<void(int)> op) {
+    fail();
+  }
+  virtual void synchronize() {
+    fail();
+  }
+  virtual ~CUDAStubs();
+
+private:
+  void fail() {
+    AT_ERROR("CUDA used in profiler but not enabled.");
+  }
+};
+
+TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
+
 constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
   return ((a + b - 1) / b) * b;
 }
@@ -64,7 +94,7 @@ enum class EventKind : uint16_t {
   PopRange
 };
 
-struct Event final {
+struct TORCH_API Event final {
   Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
   : owned_name_(new std::string(std::move(name)))
   , name_ptr_(owned_name_->c_str())
@@ -75,19 +105,7 @@ struct Event final {
   , kind_(kind)
   , thread_id_(thread_id) { record(record_cuda); }
 
-  void record(bool record_cuda) {
-#ifdef USE_CUDA
-    if (record_cuda) {
-      TORCH_CUDA_CHECK(cudaGetDevice(&device_));
-      TORCH_CUDA_CHECK(cudaEventCreate(&event));
-      auto stream = at::cuda::getCurrentCUDAStream();
-      cpu_ns_ = getTime();
-      TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
-      return;
-    }
-#endif
-    cpu_ns_ = getTime();
-  }
+  void record(bool record_cuda);
   std::string kind() const {
     switch(kind_) {
       case EventKind::Mark: return "mark";
@@ -105,29 +123,9 @@ struct Event final {
   double cpu_elapsed_us(const Event & e) {
     return (e.cpu_ns_ - cpu_ns_)/(1000.0);
   }
-  double cuda_elapsed_us(const Event & e) {
-#ifdef USE_CUDA
-    if(!e.has_cuda() || !has_cuda()) {
-      throw std::logic_error("Events were not recorded for CUDA");
-    }
-    if(e.device() != device()) {
-      throw std::logic_error("Events are not on the same device");
-    }
-    TORCH_CUDA_CHECK(cudaEventSynchronize(event));
-    TORCH_CUDA_CHECK(cudaEventSynchronize(e.event));
-    float ms;
-    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, e.event));
-    return ms*1000.0;
-#else
-    throw std::logic_error("CUDA not enabled");
-#endif
-  }
+  double cuda_elapsed_us(const Event & e);
   bool has_cuda() const {
-#ifdef USE_CUDA
     return event != nullptr;
-#else
-    return false;
-#endif
   }
   int device() const {
     return device_;
@@ -142,9 +140,7 @@ private:
   EventKind kind_;
   uint16_t thread_id_;
   int device_ = -1;
-#ifdef USE_CUDA
-  cudaEvent_t event = nullptr;
-#endif
+  struct CUevent_st* event = nullptr;
 };
 
 // a linked-list of fixed sized vectors, to avoid
diff --git a/torch/csrc/autograd/profiler_cuda.cpp b/torch/csrc/autograd/profiler_cuda.cpp
new file mode 100644
index 0000000..cbea26d
--- /dev/null
@@ -0,0 +1,65 @@
+#include <torch/csrc/autograd/profiler.h>
+#include <torch/csrc/cuda/cuda_check.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <nvToolsExt.h>
+
+#include <sstream>
+
+namespace torch { namespace autograd { namespace profiler {
+
+namespace {
+
+struct CUDAMethods : public CUDAStubs {
+  void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) override {
+    TORCH_CUDA_CHECK(cudaGetDevice(device));
+    TORCH_CUDA_CHECK(cudaEventCreate(event));
+    auto stream = at::cuda::getCurrentCUDAStream();
+    *cpu_ns = getTime();
+    TORCH_CUDA_CHECK(cudaEventRecord(*event, stream));
+  }
+  float elapsed(CUDAEventStub event, CUDAEventStub event2) override {
+    TORCH_CUDA_CHECK(cudaEventSynchronize(event));
+    TORCH_CUDA_CHECK(cudaEventSynchronize(event2));
+    float ms;
+    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, event2));
+    return ms*1000.0;
+  }
+  void nvtxMarkA(const char* name) override {
+    ::nvtxMark(name);
+  }
+  void nvtxRangePushA(const char* name) override {
+    ::nvtxRangePushA(name);
+  }
+  void nvtxRangePop() override {
+    ::nvtxRangePop();
+  }
+  void onEachDevice(std::function<void(int)> op) override {
+    at::cuda::OptionalCUDAGuard device_guard;
+    int count;
+    TORCH_CUDA_CHECK(cudaGetDeviceCount(&count));
+    for(int i = 0; i < count; i++) {
+      device_guard.set_index(i);
+      op(i);
+    }
+  }
+  void synchronize() override {
+    cudaDeviceSynchronize();
+  }
+  bool enabled() override {
+    return true;
+  }
+
+};
+
+struct RegisterCUDAMethods {
+  RegisterCUDAMethods() {
+    static CUDAMethods methods;
+    registerCUDAMethods(&methods);
+  }
+};
+RegisterCUDAMethods reg;
+
+} // namespaces
+} // namespace profiler
+} // namespace autograd
+} // namespace torch
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index c6bc582..d4045b8 100644
@@ -1,7 +1,5 @@
 #include <torch/csrc/cuda/comm.h>
 
-#ifdef USE_CUDA
-
 #include <torch/csrc/cuda/device_set.h>
 #include <torch/csrc/utils/tensor_flatten.h>
 
@@ -244,5 +242,3 @@ at::Tensor gather(
   return result;
 }
 }} // namespace torch::cuda
-
-#endif
diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h
index 960eddf..8d7671d 100644
@@ -1,7 +1,5 @@
 #pragma once
 
-#ifdef USE_CUDA
-
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Optional.h>
@@ -30,5 +28,3 @@ at::Tensor gather(
     int64_t dim,
     c10::optional<int32_t> destination_index);
 }}
-
-#endif
diff --git a/torch/csrc/cuda/cuda_check.h b/torch/csrc/cuda/cuda_check.h
index 327e90d..b000b1a 100644
@@ -1,6 +1,5 @@
 #pragma once
 
-#ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <nvrtc.h>
@@ -37,5 +36,3 @@ static inline void cudaCheck(cudaError_t result, const char * file, int line) {
 #define TORCH_CUDA_CHECK(result) ::torch::cudaCheck(result,__FILE__,__LINE__);
 
 }
-
-#endif