Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16892
Replaces https://github.com/pytorch/pytorch/pull/14517
Merged caffe2 and TH CPU Allocators. Mostly using the code from caffe2 allocators.
`memset` of caffe2 allocator is gone now. These two allocators should be almost the same.
Baseline:
```
Running ./tensor_allocation
Run on (48 X 2501 MHz CPU s)
CPU Caches:
L1 Data 32K (x24)
L1 Instruction 32K (x24)
L2 Unified 256K (x24)
L3 Unified 30720K (x2)
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_MakeStorageImpl 148 ns 148 ns 4676594
BM_StorageImplCtor 54 ns 54 ns
12957810
BM_MallocStorageImpl 62 ns 62 ns
11254745
BM_TensorImplCtor 22 ns 22 ns
31939472
BM_MallocTensorImpl 105 ns 105 ns 6505661
BM_Malloc_1 43 ns 43 ns
16464905
BM_MakeTensorFromStorage 126 ns 126 ns 5586116
BM_MakeVariableFromTensor 236 ns 236 ns 2995528
BM_ATenCPUTensorAllocationSmall1 319 ns 319 ns 2268884
BM_ATenCPUTensorAllocationSmall2 318 ns 318 ns 2163332
BM_ATenCPUTensorAllocationMedium1 403 ns 403 ns 1663228
BM_ATenCPUTensorAllocationMedium2 448 ns 448 ns 1595004
BM_ATenCPUTensorAllocationBig1 532 ns 532 ns 1352634
BM_ATenCPUTensorAllocationBig2 4486 ns 4486 ns 160978
```
Changed:
```
Running ./tensor_allocation
Run on (48 X 2501 MHz CPU s)
CPU Caches:
L1 Data 32K (x24)
L1 Instruction 32K (x24)
L2 Unified 256K (x24)
L3 Unified 30720K (x2)
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_MakeStorageImpl 141 ns 141 ns 4803576
BM_StorageImplCtor 55 ns 55 ns
13129391
BM_MallocStorageImpl 64 ns 64 ns
11088143
BM_TensorImplCtor 23 ns 23 ns
31616273
BM_MallocTensorImpl 101 ns 101 ns 7017585
BM_Malloc_1 39 ns 39 ns
18523954
BM_MakeTensorFromStorage 118 ns 118 ns 5877919
BM_MakeVariableFromTensor 452 ns 452 ns 1565722
BM_ATenCPUTensorAllocationSmall1 384 ns 384 ns 1819763
BM_ATenCPUTensorAllocationSmall2 389 ns 389 ns 1857483
BM_ATenCPUTensorAllocationMedium1 425 ns 425 ns 1646284
BM_ATenCPUTensorAllocationMedium2 430 ns 430 ns 1561319
BM_ATenCPUTensorAllocationBig1 508 ns 508 ns 1309969
BM_ATenCPUTensorAllocationBig2 3799 ns 3799 ns 173674
```
lstm benchmark:
Before:
```
INFO:lstm_bench:Iter: 1 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 21 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 41 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 61 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 81 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 101 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 121 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 141 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 161 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 181 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 201 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 221 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 241 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 261 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 281 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 301 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 321 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 341 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 361 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 381 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Done. Total EPS excluding 1st iteration: 0.8k
```
After:
```
INFO:lstm_bench:Iter: 1 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 21 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 41 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 61 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 81 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 101 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 121 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 141 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 161 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 181 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 201 / 390. Entries Per Second: 0.8k.
INFO:lstm_bench:Iter: 221 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 241 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 261 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 281 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 301 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 321 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 341 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 361 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Iter: 381 / 390. Entries Per Second: 0.7k.
INFO:lstm_bench:Done. Total EPS excluding 1st iteration: 0.8k
```
Reviewed By: ezyang
Differential Revision:
D13202632
fbshipit-source-id:
db6d2ec756ed15b0732b15396c82ad42302bb79d
#define TH_ATOMIC_IPC_REFCOUNT 1
#endif
+#include <c10/core/CPUAllocator.h>
+
#if HAVE_MMAP
#include <sys/types.h>
#include <sys/mman.h>
#endif
/* end of stuff for mapped files */
-struct THDefaultAllocator final : public at::Allocator {
- at::DataPtr allocate(size_t size) const override {
- auto* ptr = THAlloc(size);
- return {ptr, ptr, &THFree, at::DeviceType::CPU};
- }
- at::DeleterFnPtr raw_deleter() const override {
- return &THFree;
- }
-};
-
-static THDefaultAllocator th_default_allocator;
at::Allocator* getTHDefaultAllocator() {
- return &th_default_allocator;
+ return c10::GetCPUAllocator();
}
#if defined(_WIN32) || defined(HAVE_MMAP)
#include <TH/THGeneral.h>
+#ifdef __cplusplus
+#include <c10/core/CPUAllocator.h>
+#endif
+
#ifdef _OPENMP
#include <omp.h>
#endif
torchGCData = data;
}
-static void* THAllocInternal(ptrdiff_t size)
-{
- void *ptr;
-
- if (size > 5120)
- {
-#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN))
- if (posix_memalign(&ptr, 64, size) != 0)
- ptr = NULL;
-/*
-#elif defined(_WIN32)
- ptr = _aligned_malloc(size, 64);
-*/
-#else
- ptr = malloc(size);
-#endif
- }
- else
- {
- ptr = malloc(size);
- }
-
- return ptr;
-}
-
void* THAlloc(ptrdiff_t size)
{
- void *ptr;
-
if(size < 0)
THError("$ Torch: invalid memory size -- maybe an overflow?");
- if(size == 0)
- return NULL;
-
- ptr = THAllocInternal(size);
-
- if(!ptr && torchGCFunction) {
- torchGCFunction(torchGCData);
- ptr = THAllocInternal(size);
- }
-
- if(!ptr)
- THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);
-
- return ptr;
+ return c10::alloc_cpu(size);
}
void* THRealloc(void *ptr, ptrdiff_t size)
device};
}
-} // namespace c10
-
-namespace caffe2 {
-
-C10_API at::Allocator* allocator_array[static_cast<int>(
- at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)];
+C10_API at::Allocator* allocator_array[at::COMPILE_TIME_MAX_DEVICE_TYPES];
void SetAllocator(at::DeviceType t, at::Allocator* alloc) {
allocator_array[static_cast<int>(t)] = alloc;
return alloc;
}
-} // namespace caffe2
+} // namespace c10
// nullptr DataPtrs can still have a nontrivial device; this allows
// us to treat zero-size allocations uniformly with non-zero allocations.
//
-class DataPtr {
+class C10_API DataPtr {
private:
c10::detail::UniqueVoidPtr ptr_;
Device device_;
Device device);
};
-} // namespace c10
-
-// TODO: move to c10
-namespace caffe2 {
-
/** Set the allocator for DeviceType `t`. The passed in allocator pointer is
* expected to have static lifetime; this function does NOT take ownership
* of the raw pointer. (The reason for this is to prevent existing pointers
static AllocatorRegisterer<t> g_allocator_##d(f); \
}
-} // namespace caffe2
+} // namespace c10
--- /dev/null
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/typeid.h>
+#include <c10/core/DeviceType.h>
+
+// TODO: rename flags to C10
+C10_DEFINE_bool(
+ caffe2_report_cpu_memory_usage,
+ false,
+ "If set, print out detailed memory usage");
+
+C10_DEFINE_bool(
+ caffe2_cpu_allocator_do_zero_fill,
+ false,
+ "If set, do memory zerofilling when allocating on CPU");
+
+C10_DEFINE_bool(
+ caffe2_cpu_allocator_do_junk_fill,
+ false,
+ "If set, fill memory with deterministic junk when allocating on CPU");
+
+namespace c10 {
+
+void memset_junk(void* data, size_t num) {
+ // This garbage pattern is NaN when interpreted as floating point values,
+ // or as very large integer values.
+ static constexpr int32_t kJunkPattern = 0x7fedbeef;
+ static constexpr int64_t kJunkPattern64 =
+ static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
+ int32_t int64_count = num / sizeof(kJunkPattern64);
+ int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+ int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
+ for (int i = 0; i < int64_count; i++) {
+ data_i64[i] = kJunkPattern64;
+ }
+ if (remaining_bytes > 0) {
+ memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
+ }
+}
+
+void* alloc_cpu(size_t nbytes) {
+ if (nbytes == 0) {
+ return nullptr;
+ }
+
+ void* data;
+#ifdef __ANDROID__
+ data = memalign(gAlignment, nbytes);
+#elif defined(_MSC_VER)
+ data = _aligned_malloc(nbytes, gAlignment);
+#else
+ CAFFE_ENFORCE_EQ(posix_memalign(&data, gAlignment, nbytes), 0);
+#endif
+
+ CAFFE_ENFORCE(
+ data,
+ "DefaultCPUAllocator: not enough memory: you tried to allocate %dGB. Buy new RAM!",
+ nbytes / 1073741824);
+
+ // move data to a thread's NUMA node
+ NUMAMove(data, nbytes, GetCurrentNUMANode());
+ CHECK(
+ !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
+ !FLAGS_caffe2_cpu_allocator_do_junk_fill)
+ << "Cannot request both zero-fill and junk-fill at the same time";
+ if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
+ memset(data, 0, nbytes);
+ } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
+ memset_junk(data, nbytes);
+ }
+
+ return data;
+}
+
+// A virtual struct that is used to report C10's memory allocation and
+// deallocation status
+class C10_API MemoryAllocationReporter {
+ public:
+ MemoryAllocationReporter() : allocated_(0) {}
+ void New(void* ptr, size_t nbytes);
+ void Delete(void* ptr);
+
+ private:
+ std::mutex mutex_;
+ std::unordered_map<void*, size_t> size_table_;
+ size_t allocated_;
+};
+
+struct C10_API DefaultCPUAllocator final : at::Allocator {
+ DefaultCPUAllocator() {}
+ ~DefaultCPUAllocator() override {}
+ at::DataPtr allocate(size_t nbytes) const override {
+ void* data = alloc_cpu(nbytes);
+ if (FLAGS_caffe2_report_cpu_memory_usage && nbytes > 0) {
+ getMemoryAllocationReporter().New(data, nbytes);
+ return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
+ }
+ return {data, data, &Delete, at::Device(at::DeviceType::CPU)};
+ }
+
+#ifdef _MSC_VER
+ static void Delete(void* data) {
+ _aligned_free(data);
+ }
+#else
+ static void Delete(void* data) {
+ free(data);
+ }
+#endif
+
+ static void ReportAndDelete(void* ptr) {
+ if (!ptr) {
+ return;
+ }
+ getMemoryAllocationReporter().Delete(ptr);
+ Delete(ptr);
+ }
+
+ at::DeleterFnPtr raw_deleter() const override {
+ if (FLAGS_caffe2_report_cpu_memory_usage) {
+ return &ReportAndDelete;
+ }
+ return &Delete;
+ }
+
+ protected:
+ static MemoryAllocationReporter& getMemoryAllocationReporter() {
+ static MemoryAllocationReporter reporter_;
+ return reporter_;
+ }
+
+};
+
+void NoDelete(void*) {}
+
+at::Allocator* GetCPUAllocator() {
+ return GetAllocator(DeviceType::CPU);
+}
+
+void SetCPUAllocator(at::Allocator* alloc) {
+ SetAllocator(DeviceType::CPU, alloc);
+}
+
+// Global default CPU Allocator
+static DefaultCPUAllocator g_cpu_alloc;
+
+at::Allocator* GetDefaultCPUAllocator() {
+ return &g_cpu_alloc;
+}
+
+REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);
+
+void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
+ std::lock_guard<std::mutex> guard(mutex_);
+ size_table_[ptr] = nbytes;
+ allocated_ += nbytes;
+ LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated_
+ << " bytes.";
+}
+
+void MemoryAllocationReporter::Delete(void* ptr) {
+ std::lock_guard<std::mutex> guard(mutex_);
+ auto it = size_table_.find(ptr);
+ CHECK(it != size_table_.end());
+ allocated_ -= it->second;
+ LOG(INFO) << "C10 deleted " << it->second << " bytes, total alloc "
+ << allocated_ << " bytes.";
+ size_table_.erase(it);
+}
+
+} // namespace c10
--- /dev/null
+#pragma once
+
+#include <cstring>
+#include <unordered_map>
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Logging.h>
+#include <c10/util/numa.h>
+
+// TODO: rename to c10
+C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
+C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
+
+namespace c10 {
+
+// Using 64-byte alignment should be enough for computation up to AVX512.
+constexpr size_t gAlignment = 64;
+
+using MemoryDeleter = void (*)(void*);
+
+// A helper function that is basically doing nothing.
+C10_API void NoDelete(void*);
+
+// Fill the data memory region of num bytes with a particular garbage pattern.
+// The garbage value is chosen to be NaN if interpreted as floating point value,
+// or a very large integer.
+C10_API void memset_junk(void* data, size_t num);
+
+C10_API void* alloc_cpu(size_t nbytes);
+
+// Get the CPU Allocator.
+C10_API at::Allocator* GetCPUAllocator();
+// Sets the CPU allocator to the given allocator: the caller gives away the
+// ownership of the pointer.
+C10_API void SetCPUAllocator(at::Allocator* alloc);
+
+// Get the Default CPU Allocator
+C10_API at::Allocator* GetDefaultCPUAllocator();
+
+} // namespace c10
data_type,
0,
at::DataPtr(nullptr, device),
- caffe2::GetAllocator(device.type()),
+ GetAllocator(device.type()),
true) {}
StorageImpl& operator=(StorageImpl&& other) = default;
// know how to reallocate it. However, in order to preserve legacy C2
// behavior, we allow reallocating the memory using default allocator.
if (allocator == nullptr) {
- allocator = caffe2::GetAllocator(storage_.device_type());
+ allocator = GetAllocator(storage_.device_type());
}
if (meta.placementNew()) {
// For types that need placement new, we will call it, as well as
#define C10_ENABLE_NUMA
#endif
+// This code used to have a lot of VLOGs. However, because allocation might be
+// triggered during static initialization, it's unsafe to invoke VLOG here
+
namespace c10 {
#ifdef C10_ENABLE_NUMA
return;
}
if (!IsNUMAEnabled()) {
- VLOG(1) << "NUMA is not enabled";
return;
}
int GetNUMANode(const void* ptr) {
if (!IsNUMAEnabled()) {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
AT_ASSERT(ptr);
int GetNumNUMANodes() {
if (!IsNUMAEnabled()) {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
return;
}
if (!IsNUMAEnabled()) {
- VLOG(1) << "NUMA is not enabled";
return;
}
AT_ASSERT(ptr);
int GetCurrentNUMANode() {
if (!IsNUMAEnabled()) {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
}
void NUMABind(int numa_node_id) {
- if (numa_node_id >= 0) {
- VLOG(1) << "NUMA is not enabled";
- }
}
int GetNUMANode(const void* ptr) {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
int GetNumNUMANodes() {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
void NUMAMove(void* ptr, size_t size, int numa_node_id) {
- if (numa_node_id >= 0) {
- VLOG(1) << "NUMA is not enabled";
- }
}
int GetCurrentNUMANode() {
- VLOG(1) << "NUMA is not enabled";
return -1;
}
-#include <c10/core/Allocator.h>
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include <c10/util/typeid.h>
-
-C10_DEFINE_bool(
- caffe2_report_cpu_memory_usage,
- false,
- "If set, print out detailed memory usage");
-
-C10_DEFINE_bool(
- caffe2_cpu_allocator_do_zero_fill,
- false,
- "If set, do memory zerofilling when allocating on CPU");
-
-C10_DEFINE_bool(
- caffe2_cpu_allocator_do_junk_fill,
- false,
- "If set, fill memory with deterministic junk when allocating on CPU");
-
-namespace caffe2 {
-
-void memset_junk(void* data, size_t num) {
- // This garbage pattern is NaN when interpretted as floating point values,
- // or as very large integer values.
- static constexpr int32_t kJunkPattern = 0x7fedbeef;
- static constexpr int64_t kJunkPattern64 =
- static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
- int32_t int64_count = num / sizeof(kJunkPattern64);
- int32_t remaining_bytes = num % sizeof(kJunkPattern64);
- int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
- for (int i = 0; i < int64_count; i++) {
- data_i64[i] = kJunkPattern64;
- }
- if (remaining_bytes > 0) {
- memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
- }
-}
-
-void NoDelete(void*) {}
-
-at::Allocator* GetCPUAllocator() {
- return GetAllocator(CPU);
-}
-
-void SetCPUAllocator(at::Allocator* alloc) {
- SetAllocator(CPU, alloc);
-}
-
-// Global default CPU Allocator
-static DefaultCPUAllocator g_cpu_alloc;
-
-REGISTER_ALLOCATOR(CPU, &g_cpu_alloc);
-
-MemoryAllocationReporter DefaultCPUAllocator::reporter_;
-
-void MemoryAllocationReporter::New(void* ptr, size_t nbytes) {
- std::lock_guard<std::mutex> guard(mutex_);
- size_table_[ptr] = nbytes;
- allocated_ += nbytes;
- LOG(INFO) << "Caffe2 alloc " << nbytes << " bytes, total alloc " << allocated_
- << " bytes.";
-}
-
-void MemoryAllocationReporter::Delete(void* ptr) {
- std::lock_guard<std::mutex> guard(mutex_);
- auto it = size_table_.find(ptr);
- CHECK(it != size_table_.end());
- allocated_ -= it->second;
- LOG(INFO) << "Caffe2 deleted " << it->second << " bytes, total alloc "
- << allocated_ << " bytes.";
- size_table_.erase(it);
-}
-
-} // namespace caffe2
+#include "caffe2/core/allocator.h"
#ifndef CAFFE2_CORE_ALLOCATOR_H_
#define CAFFE2_CORE_ALLOCATOR_H_
-
-#include <cstring>
-#include <unordered_map>
-
-#include <c10/core/Allocator.h>
-#include "caffe2/core/logging.h"
-#include "caffe2/core/numa.h"
-
-C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
-C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill);
-C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill);
-
-namespace caffe2 {
-
-// Use 64-byte alignment should be enough for computation up to AVX512.
-constexpr size_t gCaffe2Alignment = 64;
-
-using MemoryDeleter = void (*)(void*);
-
-// A helper function that is basically doing nothing.
-CAFFE2_API void NoDelete(void*);
-
-// A virtual allocator class to do memory allocation and deallocation.
-struct CAFFE2_API CPUAllocator {
- CPUAllocator() {}
- virtual ~CPUAllocator() noexcept {}
- virtual std::pair<void*, MemoryDeleter> New(size_t nbytes) = 0;
- virtual MemoryDeleter GetDeleter() = 0;
-};
-
-// A virtual struct that is used to report Caffe2's memory allocation and
-// deallocation status
-class CAFFE2_API MemoryAllocationReporter {
- public:
- MemoryAllocationReporter() : allocated_(0) {}
- void New(void* ptr, size_t nbytes);
- void Delete(void* ptr);
-
- private:
- std::mutex mutex_;
- std::unordered_map<void*, size_t> size_table_;
- size_t allocated_;
-};
-
-// Fill the data memory region of num bytes with a particular garbage pattern.
-// The garbage value is chosen to be NaN if interpreted as floating point value,
-// or a very large integer.
-CAFFE2_API void memset_junk(void* data, size_t num);
-
-struct CAFFE2_API DefaultCPUAllocator final : at::Allocator {
- DefaultCPUAllocator() {}
- ~DefaultCPUAllocator() override {}
- at::DataPtr allocate(size_t nbytes) const override {
- void* data = nullptr;
-#ifdef __ANDROID__
- data = memalign(gCaffe2Alignment, nbytes);
-#elif defined(_MSC_VER)
- data = _aligned_malloc(nbytes, gCaffe2Alignment);
-#else
- CAFFE_ENFORCE_EQ(posix_memalign(&data, gCaffe2Alignment, nbytes), 0);
-#endif
- CAFFE_ENFORCE(data);
- // move data to a thread's NUMA node
- NUMAMove(data, nbytes, GetCurrentNUMANode());
- CHECK(
- !FLAGS_caffe2_cpu_allocator_do_zero_fill ||
- !FLAGS_caffe2_cpu_allocator_do_junk_fill)
- << "Cannot request both zero-fill and junk-fill at the same time";
- if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
- memset(data, 0, nbytes);
- } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
- memset_junk(data, nbytes);
- }
- if (FLAGS_caffe2_report_cpu_memory_usage) {
- reporter_.New(data, nbytes);
- return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
- }
- return {data, data, &Delete, at::Device(at::DeviceType::CPU)};
- }
-
-#ifdef _MSC_VER
- static void Delete(void* data) {
- _aligned_free(data);
- }
-#else
- static void Delete(void* data) {
- free(data);
- }
-#endif
-
- static void ReportAndDelete(void* ptr) {
- reporter_.Delete(ptr);
- Delete(ptr);
- }
-
- at::DeleterFnPtr raw_deleter() const override {
- if (FLAGS_caffe2_report_cpu_memory_usage) {
- return &ReportAndDelete;
- }
- return &Delete;
- }
-
- protected:
- static MemoryAllocationReporter reporter_;
-};
-
-// Get the CPU Alloctor.
-CAFFE2_API at::Allocator* GetCPUAllocator();
-// Sets the CPU allocator to the given allocator: the caller gives away the
-// ownership of the pointer.
-CAFFE2_API void SetCPUAllocator(at::Allocator* alloc);
-
-} // namespace caffe2
-
+#include <c10/core/CPUAllocator.h>
#endif // CAFFE2_CORE_ALLOCATOR_H_
}
}
+/**
+ * An allocator that does the CPU memory allocation with pinned memory.
+ *
+ * This is needed because if we want to do any asynchronous cuda memcpy,
+ * the underlying CPU memory also needs to be allocated into pinned memory
+ * space. As a result, whenever Caffe2 is built with GPU and there is
+ * GPU present during runtime, at global initialization time we will set
+ * the CPU memory allocator to allocate pinned memory.
+ *
+ * NB: This behavior is probably too aggressive. We should consider asking users
+ * to do on-demand memory pinning (like exposed in PyTorch APIs) instead.
+ */
+struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
+ PinnedCPUAllocator() {
+ baseAllocator_ = GetDefaultCPUAllocator();
+ }
+ ~PinnedCPUAllocator() override {}
+ at::DataPtr allocate(size_t nbytes) const override {
+ if (nbytes == 0) {
+ // replicate c10::alloc_cpu behavior - return nullptr
+ return {nullptr, nullptr, &Delete, at::Device(CPU)};
+ }
+ void* data;
+ at::DataPtr data_ptr;
+ std::lock_guard<std::mutex> lock(CUDAContext::mutex());
+ if (IsNUMAEnabled()) {
+ data_ptr = baseAllocator_->allocate(nbytes);
+ data = data_ptr.get();
+ CAFFE_ENFORCE(data);
+ CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
+ } else {
+ CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+ data_ptr = {data, data, &Delete, at::Device(CPU)};
+ }
+ memset(data, 0, nbytes);
+ return data_ptr;
+ }
+
+ at::DeleterFnPtr raw_deleter() const override {
+ return &Delete;
+ }
+
+ private:
+ static void Delete(void* data) {
+ if (!data) {
+ return;
+ }
+ // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
+ // or not. If a CUDAContext::New() call is made, inside the CUDAContext
+ // function we will switch the cpu side allocator to a PinnedCPUAllocator.
+ // But, if one calls CPUContext::New() before any cuda allocations,
+ // PinnedCPUAllocator can still delete the corresponding memory.
+ std::lock_guard<std::mutex> lock(CUDAContext::mutex());
+ if (IsNUMAEnabled()) {
+ CUDA_ENFORCE(cudaHostUnregister(data));
+ GetDefaultCPUAllocator()->raw_deleter()(data);
+ } else {
+ cudaError_t err = cudaFreeHost(data);
+ if (err == cudaErrorInvalidValue) {
+ free(data);
+ // Calling cudaGetLastError will reset the cuda error.
+ cudaError_t _err = cudaGetLastError();
+ } else {
+ // For all other errors, still do a cuda check.
+ CUDA_ENFORCE(err);
+ }
+ }
+ }
+
+ at::Allocator* baseAllocator_;
+};
+
static PinnedCPUAllocator g_pinned_cpu_alloc;
// An initialization function that sets the CPU side to use pinned cpu
static ThreadLocalCUDAObjects& getCudaObjects();
};
-/**
- * An allocator that does the CPU memory allocation with pinned memory.
- *
- * This is needed because if we want to do any asynchronous cuda memcpy,
- * the underlying CPU memory also needs to be allocated into pinned memory
- * space. As a result, whenever Caffe2 is built with GPU and there is
- * GPU present during runtime, at global initialization time we will set
- * the CPU memory allocator to allocate pinned memory.
- */
-struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
- PinnedCPUAllocator() {}
- ~PinnedCPUAllocator() override {}
- at::DataPtr allocate(size_t nbytes) const override {
- void* data;
- at::DataPtr data_ptr;
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- if (IsNUMAEnabled()) {
- data_ptr = baseAllocator_.allocate(nbytes);
- data = data_ptr.get();
- CAFFE_ENFORCE(data);
- CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
- } else {
- CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
- data_ptr = {data, data, &Delete, at::Device(CPU)};
- }
- memset(data, 0, nbytes);
- return data_ptr;
- }
-
- at::DeleterFnPtr raw_deleter() const override {
- return &Delete;
- }
-
- private:
- static void Delete(void* data) {
- // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
- // or not. If a CUDAContext::New() call is made, inside the CUDAContext
- // function we will switch the cpu side allocator to a PinnedCPUAllocator.
- // But, if one calls CPUContext::New() before any cuda allocations,
- // PinnedCPUAllocator can still delete the corresponding memory.
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- if (IsNUMAEnabled()) {
- CUDA_ENFORCE(cudaHostUnregister(data));
- DefaultCPUAllocator::Delete(data);
- } else {
- cudaError_t err = cudaFreeHost(data);
- if (err == cudaErrorInvalidValue) {
- free(data);
- // Calling cudaGetLastError will reset the cuda error.
- cudaError_t _err = cudaGetLastError();
- } else {
- // For all other errors, still do a cuda check.
- CUDA_ENFORCE(err);
- }
- }
- }
-
- DefaultCPUAllocator baseAllocator_;
-};
-
using TensorCUDA = Tensor;
} // namespace caffe2
TEST(CPUContextTest, TestAllocAlignment) {
for (int i = 1; i < 10; ++i) {
auto data = CPUContext::New(i);
- EXPECT_EQ((reinterpret_cast<size_t>(data.get()) % gCaffe2Alignment), 0);
+ EXPECT_EQ((reinterpret_cast<size_t>(data.get()) % gAlignment), 0);
// data is freed when out of scope
}
}