unify c2 and TH allocator (#16892)

author Dmytro Dzhulgakov <dzhulgakov@fb.com>

Wed, 13 Feb 2019 05:13:25 +0000 (21:13 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Wed, 13 Feb 2019 05:16:34 +0000 (21:16 -0800)
author Dmytro Dzhulgakov <dzhulgakov@fb.com>
Wed, 13 Feb 2019 05:13:25 +0000 (21:13 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 13 Feb 2019 05:16:34 +0000 (21:16 -0800)
diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp

index 08bf56b..b6972e0 100644 (file)
--- a/aten/src/TH/THAllocator.cpp
+++ b/aten/src/TH/THAllocator.cpp
@@ -10,6 +10,8 @@
  #define TH_ATOMIC_IPC_REFCOUNT 1
  #endif
  
+#include <c10/core/CPUAllocator.h>
+
  #if HAVE_MMAP
  #include <sys/types.h>
  #include <sys/mman.h>
@@ -19,19 +21,8 @@
  #endif
  /* end of stuff for mapped files */
  
-struct THDefaultAllocator final : public at::Allocator {
-  at::DataPtr allocate(size_t size) const override {
-    auto* ptr = THAlloc(size);
-    return {ptr, ptr, &THFree, at::DeviceType::CPU};
-  }
-  at::DeleterFnPtr raw_deleter() const override {
-    return &THFree;
-  }
-};
-
-static THDefaultAllocator th_default_allocator;
  at::Allocator* getTHDefaultAllocator() {
-  return &th_default_allocator;
+  return c10::GetCPUAllocator();
  }
  
  #if defined(_WIN32) || defined(HAVE_MMAP)
diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp

index 2ac3605..e2d6942 100644 (file)
--- a/aten/src/TH/THGeneral.cpp
+++ b/aten/src/TH/THGeneral.cpp
@@ -1,5 +1,9 @@
  #include <TH/THGeneral.h>
  
+#ifdef __cplusplus
+#include <c10/core/CPUAllocator.h>
+#endif
+
  #ifdef _OPENMP
  #include <omp.h>
  #endif
@@ -155,52 +159,12 @@ void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data )
    torchGCData = data;
  }
  
-static void* THAllocInternal(ptrdiff_t size)
-{
-  void *ptr;
-
-  if (size > 5120)
-  {
-#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN))
-    if (posix_memalign(&ptr, 64, size) != 0)
-      ptr = NULL;
-/*
-#elif defined(_WIN32)
-    ptr = _aligned_malloc(size, 64);
-*/
-#else
-    ptr = malloc(size);
-#endif
-  }
-  else
-  {
-    ptr = malloc(size);
-  }
-
-  return ptr;
-}
-
  void* THAlloc(ptrdiff_t size)
  {
-  void *ptr;
-
    if(size < 0)
      THError("$ Torch: invalid memory size -- maybe an overflow?");
  
-  if(size == 0)
-    return NULL;
-
-  ptr = THAllocInternal(size);
-
-  if(!ptr && torchGCFunction) {
-    torchGCFunction(torchGCData);
-    ptr = THAllocInternal(size);
-  }
-
-  if(!ptr)
-    THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);
-
-  return ptr;
+  return c10::alloc_cpu(size);
  }
  
  void* THRealloc(void *ptr, ptrdiff_t size)
diff --git a/c10/core/Allocator.cpp b/c10/core/Allocator.cpp

index 223234d..4dd8b1a 100644 (file)
--- a/c10/core/Allocator.cpp
+++ b/c10/core/Allocator.cpp
@@ -16,12 +16,7 @@ at::DataPtr InefficientStdFunctionContext::makeDataPtr(
            device};
  }
  
-} // namespace c10
-
-namespace caffe2 {
-
-C10_API at::Allocator* allocator_array[static_cast<int>(
-    at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)];
+C10_API at::Allocator* allocator_array[at::COMPILE_TIME_MAX_DEVICE_TYPES];
  
  void SetAllocator(at::DeviceType t, at::Allocator* alloc) {
    allocator_array[static_cast<int>(t)] = alloc;
@@ -33,4 +28,4 @@ at::Allocator* GetAllocator(const at::DeviceType& t) {
    return alloc;
  }
  
-} // namespace caffe2
+} // namespace c10
diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h

index 398f47e..426b6e8 100644 (file)
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@@ -16,7 +16,7 @@ namespace c10 {
  // nullptr DataPtrs can still have a nontrivial device; this allows
  // us to treat zero-size allocations uniformly with non-zero allocations.
  //
-class DataPtr {
+class C10_API DataPtr {
   private:
    c10::detail::UniqueVoidPtr ptr_;
    Device device_;
@@ -181,11 +181,6 @@ struct C10_API InefficientStdFunctionContext {
        Device device);
  };
  
-} // namespace c10
-
-// TODO: move to c10
-namespace caffe2 {
-
  /** Set the allocator for DeviceType `t`. The passed in allocator pointer is
   *  expected to have static lifetime; this function does NOT take ownership
   *  of the raw pointer. (The reason for this is to prevent existing pointers
@@ -210,4 +205,4 @@ struct AllocatorRegisterer {
    static AllocatorRegisterer<t> g_allocator_##d(f); \
    }
  
-} // namespace caffe2
+} // namespace c10
diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp

new file mode 100644 (file)

index 0000000..10cff34
--- /dev/null
+++ b/c10/core/CPUAllocator.cpp
@@ -0,0 +1,170 @@
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/typeid.h>
+#include <c10/core/DeviceType.h>
+
+// TODO: rename flags to C10
+C10_DEFINE_bool(
+    caffe2_report_cpu_memory_usage,
+    false,
+    "If set, print out detailed memory usage");
+
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_zero_fill,
+    false,
+    "If set, do memory zerofilling when allocating on CPU");
+
+C10_DEFINE_bool(
+    caffe2_cpu_allocator_do_junk_fill,
+    false,
+    "If set, fill memory with deterministic junk when allocating on CPU");
+
+namespace c10 {
+
+void memset_junk(void* data, size_t num) {
+  // This garbage pattern is NaN when interpreted as floating point values,
+  // or as very large integer values.
+  static constexpr int32_t kJunkPattern = 0x7fedbeef;
+  static constexpr int64_t kJunkPattern64 =
+      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+  int32_t int64_count = num / sizeof(kJunkPattern64);
+  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+  for (int i = 0; i < int64_count; i++) {
+    data_i64[i] = kJunkPattern64;
+  }
+  if (remaining_bytes > 0) {
+    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+  }
+}
+
+void* alloc_cpu(size_t nbytes) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+
+  void* data;
+#ifdef __ANDROID__
+  data = memalign(gAlignment, nbytes);
+#elif defined(_MSC_VER)
+  data = _aligned_malloc(nbytes, gAlignment);
+#else
+  CAFFE_ENFORCE_EQ(posix_memalign(&data, gAlignment, nbytes), 0);
+#endif
+
+  CAFFE_ENFORCE(
+      data,
+      "DefaultCPUAllocator: not enough memory: you tried to allocate %dGB. Buy new RAM!",
+      nbytes / 1073741824);
+
+  // move data to a thread's NUMA node
+  NUMAMove(data, nbytes, GetCurrentNUMANode());
+  CHECK(
+      !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+      !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+    << "Cannot request both zero-fill and junk-fill at the same time";
+  if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
+    memset(data, 0, nbytes);
+  } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+    memset_junk(data, nbytes);
+  }
+
+  return data;
+}
+
+// A virtual struct that is used to report C10's memory allocation and
+// deallocation status
+class C10_API MemoryAllocationReporter {
+ public:
+  MemoryAllocationReporter() : allocated_(0) {}
+  void New(void* ptr, size_t nbytes);
+  void Delete(void* ptr);
+
+ private:
+  std::mutex mutex_;
+  std::unordered_map<void*, size_t> size_table_;
+  size_t allocated_;
+};
+
+struct C10_API DefaultCPUAllocator final : at::Allocator {
+  DefaultCPUAllocator() {}
+  ~DefaultCPUAllocator() override {}
+  at::DataPtr allocate(size_t nbytes) const override {
+    void* data = alloc_cpu(nbytes);
+    if (FLAGS_caffe2_report_cpu_memory_usage && nbytes > 0) {
+      getMemoryAllocationReporter().New(data, nbytes);
+      return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
+    }
+    return {data, data, &Delete, at::Device(at::DeviceType::CPU)};
+  }
+
+#ifdef _MSC_VER
+  static void Delete(void* data) {
+    _aligned_free(data);
+  }
+#else
+  static void Delete(void* data) {
+    free(data);
+  }
+#endif
+
+  static void ReportAndDelete(void* ptr) {
+    if (!ptr) {
+      return;
+    }
+    getMemoryAllocationReporter().Delete(ptr);
+    Delete(ptr);
+  }
+
+  at::DeleterFnPtr raw_deleter() const override {
+    if (FLAGS_caffe2_report_cpu_memory_usage) {
+      return &ReportAndDelete;
+    }
+    return &Delete;
+  }
+
+ protected:
+  static MemoryAllocationReporter& getMemoryAllocationReporter() {
+    static MemoryAllocationReporter reporter_;
+    return reporter_;
+  }
+
+};
+
+void NoDelete(void*) {}
+
+at::Allocator* GetCPUAllocator() {
+  return GetAllocator(DeviceType::CPU);
+}
+
+void SetCPUAllocator(at::Allocator* alloc) {
+  SetAllocator(DeviceType::CPU, alloc);
+}
+
+// Global default CPU Allocator
+static DefaultCPUAllocator g_cpu_alloc;
+
+at::Allocator* GetDefaultCPUAllocator() {
+  return &g_cpu_alloc;
+}
+
+REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);
+
+void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  size_table_[ptr] = nbytes;
+  allocated_ += nbytes;
+  LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated_
+            << " bytes.";
+}
+
+void MemoryAllocationReporter::Delete(void* ptr) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  auto it = size_table_.find(ptr);
+  CHECK(it != size_table_.end());
+  allocated_ -= it->second;
+  LOG(INFO) << "C10 deleted " << it->second << " bytes, total alloc "
+            << allocated_ << " bytes.";
+  size_table_.erase(it);
+}
+
+} // namespace c10
diff --git a/c10/core/CPUAllocator.h b/c10/core/CPUAllocator.h

new file mode 100644 (file)

index 0000000..66e940e
--- /dev/null
+++ b/c10/core/CPUAllocator.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <cstring>
+#include <unordered_map>
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Logging.h>
+#include <c10/util/numa.h>
+
+// TODO: rename to c10
+C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
+
+namespace c10 {
+
+// Use 64-byte alignment should be enough for computation up to AVX512.
+constexpr size_t gAlignment = 64;
+
+using MemoryDeleter = void (*)(void*);
+
+// A helper function that is basically doing nothing.
+C10_API void NoDelete(void*);
+
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as floating point value,
+// or a very large integer.
+C10_API void memset_junk(void* data, size_t num);
+
+C10_API void* alloc_cpu(size_t nbytes);
+
+// Get the CPU Alloctor.
+C10_API at::Allocator* GetCPUAllocator();
+// Sets the CPU allocator to the given allocator: the caller gives away the
+// ownership of the pointer.
+C10_API void SetCPUAllocator(at::Allocator* alloc);
+
+// Get the Default CPU Allocator
+C10_API at::Allocator* GetDefaultCPUAllocator();
+
+} // namespace c10
diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h

index 67bac37..625b55e 100644 (file)
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@@ -53,7 +53,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
              data_type,
              0,
              at::DataPtr(nullptr, device),
-            caffe2::GetAllocator(device.type()),
+            GetAllocator(device.type()),
              true) {}
  
    StorageImpl& operator=(StorageImpl&& other) = default;
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h

index e0d88df..50ba97d 100644 (file)
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -1188,7 +1188,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
        // know how to reallocate it. However, in order to preserve legacy C2
        // behavior, we allow reallocating the memory using default allocator.
        if (allocator == nullptr) {
-        allocator = caffe2::GetAllocator(storage_.device_type());
+        allocator = GetAllocator(storage_.device_type());
        }
        if (meta.placementNew()) {
          // For types that need placement new, we will call it, as well as
diff --git a/c10/util/numa.cpp b/c10/util/numa.cpp

index 4a5086e..530316a 100644 (file)
--- a/c10/util/numa.cpp
+++ b/c10/util/numa.cpp
@@ -9,6 +9,9 @@ C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");
  #define C10_ENABLE_NUMA
  #endif
  
+// This code used to have a lot of VLOGs. However, because allocation might be
+// triggered during static initialization, it's unsafe to invoke VLOG here
+
  namespace c10 {
  
  #ifdef C10_ENABLE_NUMA
@@ -21,7 +24,6 @@ void NUMABind(int numa_node_id) {
      return;
    }
    if (!IsNUMAEnabled()) {
-    VLOG(1) << "NUMA is not enabled";
      return;
    }
  
@@ -39,7 +41,6 @@ void NUMABind(int numa_node_id) {
  
  int GetNUMANode(const void* ptr) {
    if (!IsNUMAEnabled()) {
-    VLOG(1) << "NUMA is not enabled";
      return -1;
    }
    AT_ASSERT(ptr);
@@ -59,7 +60,6 @@ int GetNUMANode(const void* ptr) {
  
  int GetNumNUMANodes() {
    if (!IsNUMAEnabled()) {
-    VLOG(1) << "NUMA is not enabled";
      return -1;
    }
  
@@ -71,7 +71,6 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) {
      return;
    }
    if (!IsNUMAEnabled()) {
-    VLOG(1) << "NUMA is not enabled";
      return;
    }
    AT_ASSERT(ptr);
@@ -97,7 +96,6 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) {
  
  int GetCurrentNUMANode() {
    if (!IsNUMAEnabled()) {
-    VLOG(1) << "NUMA is not enabled";
      return -1;
    }
  
@@ -112,29 +110,20 @@ bool IsNUMAEnabled() {
  }
  
  void NUMABind(int numa_node_id) {
-  if (numa_node_id >= 0) {
-    VLOG(1) << "NUMA is not enabled";
-  }
  }
  
  int GetNUMANode(const void* ptr) {
-  VLOG(1) << "NUMA is not enabled";
    return -1;
  }
  
  int GetNumNUMANodes() {
-  VLOG(1) << "NUMA is not enabled";
    return -1;
  }
  
  void NUMAMove(void* ptr, size_t size, int numa_node_id) {
-  if (numa_node_id >= 0) {
-    VLOG(1) << "NUMA is not enabled";
-  }
  }
  
  int GetCurrentNUMANode() {
-  VLOG(1) << "NUMA is not enabled";
    return -1;
  }
  
diff --git a/caffe2/core/allocator.cc b/caffe2/core/allocator.cc

index e502476..1a73342 100644 (file)
--- a/caffe2/core/allocator.cc
+++ b/caffe2/core/allocator.cc
@@ -1,75 +1 @@
-#include <c10/core/Allocator.h>
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include <c10/util/typeid.h>
-
-C10_DEFINE_bool(
-    caffe2_report_cpu_memory_usage,
-    false,
-    "If set, print out detailed memory usage");
-
-C10_DEFINE_bool(
-    caffe2_cpu_allocator_do_zero_fill,
-    false,
-    "If set, do memory zerofilling when allocating on CPU");
-
-C10_DEFINE_bool(
-    caffe2_cpu_allocator_do_junk_fill,
-    false,
-    "If set, fill memory with deterministic junk when allocating on CPU");
-
-namespace caffe2 {
-
-void memset_junk(void* data, size_t num) {
-  // This garbage pattern is NaN when interpretted as floating point values,
-  // or as very large integer values.
-  static constexpr int32_t kJunkPattern = 0x7fedbeef;
-  static constexpr int64_t kJunkPattern64 =
-      static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
-  int32_t int64_count = num / sizeof(kJunkPattern64);
-  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
-  int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
-  for (int i = 0; i < int64_count; i++) {
-    data_i64[i] = kJunkPattern64;
-  }
-  if (remaining_bytes > 0) {
-    memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
-  }
-}
-
-void NoDelete(void*) {}
-
-at::Allocator* GetCPUAllocator() {
-  return GetAllocator(CPU);
-}
-
-void SetCPUAllocator(at::Allocator* alloc) {
-  SetAllocator(CPU, alloc);
-}
-
-// Global default CPU Allocator
-static DefaultCPUAllocator g_cpu_alloc;
-
-REGISTER_ALLOCATOR(CPU, &g_cpu_alloc);
-
-MemoryAllocationReporter DefaultCPUAllocator::reporter_;
-
-void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  size_table_[ptr] = nbytes;
-  allocated_ += nbytes;
-  LOG(INFO) << "Caffe2 alloc " << nbytes << " bytes, total alloc " << allocated_
-            << " bytes.";
-}
-
-void MemoryAllocationReporter::Delete(void* ptr) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  auto it = size_table_.find(ptr);
-  CHECK(it != size_table_.end());
-  allocated_ -= it->second;
-  LOG(INFO) << "Caffe2 deleted " << it->second << " bytes, total alloc "
-            << allocated_ << " bytes.";
-  size_table_.erase(it);
-}
-
-} // namespace caffe2
+#include "caffe2/core/allocator.h"
diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h

index c9b1c52..a55143e 100644 (file)
--- a/caffe2/core/allocator.h
+++ b/caffe2/core/allocator.h
@@ -1,117 +1,4 @@
  #ifndef CAFFE2_CORE_ALLOCATOR_H_
  #define CAFFE2_CORE_ALLOCATOR_H_
-
-#include <cstring>
-#include <unordered_map>
-
-#include <c10/core/Allocator.h>
-#include "caffe2/core/logging.h"
-#include "caffe2/core/numa.h"
-
-C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
-C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
-C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
-
-namespace caffe2 {
-
-// Use 64-byte alignment should be enough for computation up to AVX512.
-constexpr size_t gCaffe2Alignment = 64;
-
-using MemoryDeleter = void (*)(void*);
-
-// A helper function that is basically doing nothing.
-CAFFE2_API void NoDelete(void*);
-
-// A virtual allocator class to do memory allocation and deallocation.
-struct CAFFE2_API CPUAllocator {
-  CPUAllocator() {}
-  virtual ~CPUAllocator() noexcept {}
-  virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) = 0;
-  virtual MemoryDeleter GetDeleter() = 0;
-};
-
-// A virtual struct that is used to report Caffe2's memory allocation and
-// deallocation status
-class CAFFE2_API MemoryAllocationReporter {
- public:
-  MemoryAllocationReporter() : allocated_(0) {}
-  void New(void* ptr, size_t nbytes);
-  void Delete(void* ptr);
-
- private:
-  std::mutex mutex_;
-  std::unordered_map<void*, size_t> size_table_;
-  size_t allocated_;
-};
-
-// Fill the data memory region of num bytes with a particular garbage pattern.
-// The garbage value is chosen to be NaN if interpreted as floating point value,
-// or a very large integer.
-CAFFE2_API void memset_junk(void* data, size_t num);
-
-struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
-  DefaultCPUAllocator() {}
-  ~DefaultCPUAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) const override {
-    void* data = nullptr;
-#ifdef __ANDROID__
-    data = memalign(gCaffe2Alignment, nbytes);
-#elif defined(_MSC_VER)
-    data = _aligned_malloc(nbytes, gCaffe2Alignment);
-#else
-    CAFFE_ENFORCE_EQ(posix_memalign(&data, gCaffe2Alignment, nbytes), 0);
-#endif
-    CAFFE_ENFORCE(data);
-    // move data to a thread's NUMA node
-    NUMAMove(data, nbytes, GetCurrentNUMANode());
-    CHECK(
-        !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
-        !FLAGS_caffe2_cpu_allocator_do_junk_fill)
-        << "Cannot request both zero-fill and junk-fill at the same time";
-    if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
-      memset(data, 0, nbytes);
-    } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
-      memset_junk(data, nbytes);
-    }
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      reporter_.New(data, nbytes);
-      return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
-    }
-    return {data, data, &Delete, at::Device(at::DeviceType::CPU)};
-  }
-
-#ifdef _MSC_VER
-  static void Delete(void* data) {
-    _aligned_free(data);
-  }
-#else
-  static void Delete(void* data) {
-    free(data);
-  }
-#endif
-
-  static void ReportAndDelete(void* ptr) {
-    reporter_.Delete(ptr);
-    Delete(ptr);
-  }
-
-  at::DeleterFnPtr raw_deleter() const override {
-    if (FLAGS_caffe2_report_cpu_memory_usage) {
-      return &ReportAndDelete;
-    }
-    return &Delete;
-  }
-
- protected:
-  static MemoryAllocationReporter reporter_;
-};
-
-// Get the CPU Alloctor.
-CAFFE2_API at::Allocator* GetCPUAllocator();
-// Sets the CPU allocator to the given allocator: the caller gives away the
-// ownership of the pointer.
-CAFFE2_API void SetCPUAllocator(at::Allocator* alloc);
-
-} // namespace caffe2
-
+#include <c10/core/CPUAllocator.h>
  #endif // CAFFE2_CORE_ALLOCATOR_H_
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu

index 48e5986..97b12fa 100644 (file)
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -278,6 +278,78 @@ static void Caffe2SetCUDAMemoryPool() {
    }
  }
  
+/**
+ * An allocator that does the CPU memory allocation with pinned memory.
+ *
+ * This is needed because if we want to do any asynchronous cuda memcpy,
+ * the underlying CPU memory also needs to be allocated into pinned memory
+ * space. As a result, whenever Caffe2 is built with GPU and there is
+ * GPU present during runtime, at global initialization time we will set
+ * the CPU memory allocator to allocate pinned memory.
+ *
+ * NB: This behavior is probably too agressive. We should consider asking users
+ * to do on-demand memory pinning (like exposed in PyTorch APIs) instead.
+ */
+struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
+  PinnedCPUAllocator() {
+    baseAllocator_ = GetDefaultCPUAllocator();
+  }
+  ~PinnedCPUAllocator() override {}
+  at::DataPtr allocate(size_t nbytes) const override {
+    if (nbytes == 0) {
+      // replicate c10::alloc_cpu behavior - return nullptr
+      return {nullptr, nullptr, &Delete, at::Device(CPU)};
+    }
+    void* data;
+    at::DataPtr data_ptr;
+    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
+    if (IsNUMAEnabled()) {
+      data_ptr = baseAllocator_->allocate(nbytes);
+      data = data_ptr.get();
+      CAFFE_ENFORCE(data);
+      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
+    } else {
+      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+      data_ptr = {data, data, &Delete, at::Device(CPU)};
+    }
+    memset(data, 0, nbytes);
+    return data_ptr;
+  }
+
+  at::DeleterFnPtr raw_deleter() const override {
+    return &Delete;
+  }
+
+ private:
+  static void Delete(void* data) {
+    if (!data) {
+      return;
+    }
+    // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
+    // or not. If a CUDAContext::New() call is made, inside the CUDAContext
+    // function we will switch the cpu side allocator to a PinnedCPUAllocator.
+    // But, if one calls CPUContext::New() before any cuda allocations,
+    // PinnedCPUAllocator can still delete the corresponding memory.
+    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
+    if (IsNUMAEnabled()) {
+      CUDA_ENFORCE(cudaHostUnregister(data));
+      GetDefaultCPUAllocator()->raw_deleter()(data);
+    } else {
+      cudaError_t err = cudaFreeHost(data);
+      if (err == cudaErrorInvalidValue) {
+        free(data);
+        // Calling cudaGetLastError will reset the cuda error.
+        cudaError_t _err = cudaGetLastError();
+      } else {
+        // For all other errors, still do a cuda check.
+        CUDA_ENFORCE(err);
+      }
+    }
+  }
+
+  at::Allocator* baseAllocator_;
+};
+
  static PinnedCPUAllocator g_pinned_cpu_alloc;
  
  // An initialization function that sets the CPU side to use pinned cpu
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h

index 9eb7fe5..a4b15a2 100644 (file)
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -341,66 +341,6 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
    static ThreadLocalCUDAObjects& getCudaObjects();
  };
  
-/**
- * An allocator that does the CPU memory allocation with pinned memory.
- *
- * This is needed because if we want to do any asynchronous cuda memcpy,
- * the underlying CPU memory also needs to be allocated into pinned memory
- * space. As a result, whenever Caffe2 is built with GPU and there is
- * GPU present during runtime, at global initialization time we will set
- * the CPU memory allocator to allocate pinned memory.
- */
-struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
-  PinnedCPUAllocator() {}
-  ~PinnedCPUAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) const override {
-    void* data;
-    at::DataPtr data_ptr;
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    if (IsNUMAEnabled()) {
-      data_ptr = baseAllocator_.allocate(nbytes);
-      data = data_ptr.get();
-      CAFFE_ENFORCE(data);
-      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
-    } else {
-      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
-      data_ptr = {data, data, &Delete, at::Device(CPU)};
-    }
-    memset(data, 0, nbytes);
-    return data_ptr;
-  }
-
-  at::DeleterFnPtr raw_deleter() const override {
-    return &Delete;
-  }
-
- private:
-  static void Delete(void* data) {
-    // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
-    // or not. If a CUDAContext::New() call is made, inside the CUDAContext
-    // function we will switch the cpu side allocator to a PinnedCPUAllocator.
-    // But, if one calls CPUContext::New() before any cuda allocations,
-    // PinnedCPUAllocator can still delete the corresponding memory.
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    if (IsNUMAEnabled()) {
-      CUDA_ENFORCE(cudaHostUnregister(data));
-      DefaultCPUAllocator::Delete(data);
-    } else {
-      cudaError_t err = cudaFreeHost(data);
-      if (err == cudaErrorInvalidValue) {
-        free(data);
-        // Calling cudaGetLastError will reset the cuda error.
-        cudaError_t _err = cudaGetLastError();
-      } else {
-        // For all other errors, still do a cuda check.
-        CUDA_ENFORCE(err);
-      }
-    }
-  }
-
-  DefaultCPUAllocator baseAllocator_;
-};
-
  using TensorCUDA = Tensor;
  
  }  // namespace caffe2
diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc

index d166a64..71f8925 100644 (file)
--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@@ -9,7 +9,7 @@ namespace caffe2 {
  TEST(CPUContextTest, TestAllocAlignment) {
    for (int i = 1; i < 10; ++i) {
      auto data = CPUContext::New(i);
-    EXPECT_EQ((reinterpret_cast<size_t>(data.get()) % gCaffe2Alignment), 0);
+    EXPECT_EQ((reinterpret_cast<size_t>(data.get()) % gAlignment), 0);
      // data is freed when out of scope
    }
  }
author	Dmytro Dzhulgakov <dzhulgakov@fb.com>
	Wed, 13 Feb 2019 05:13:25 +0000 (21:13 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Wed, 13 Feb 2019 05:16:34 +0000 (21:16 -0800)
aten/src/TH/THAllocator.cpp		patch \| blob \| history
aten/src/TH/THGeneral.cpp		patch \| blob \| history
c10/core/Allocator.cpp		patch \| blob \| history
c10/core/Allocator.h		patch \| blob \| history
c10/core/CPUAllocator.cpp	[new file with mode: 0644]	patch \| blob
c10/core/CPUAllocator.h	[new file with mode: 0644]	patch \| blob
c10/core/StorageImpl.h		patch \| blob \| history
c10/core/TensorImpl.h		patch \| blob \| history
c10/util/numa.cpp		patch \| blob \| history
caffe2/core/allocator.cc		patch \| blob \| history
caffe2/core/allocator.h		patch \| blob \| history
caffe2/core/context_gpu.cu		patch \| blob \| history
caffe2/core/context_gpu.h		patch \| blob \| history
caffe2/core/context_test.cc		patch \| blob \| history