return sp;
}
-cudaError_t
-Legacy::Alloc(void** refPtr, size_t nbytes, cudaStream_t stream) {
- caching_allocator.malloc(refPtr, nbytes, stream);
- return cudaSuccess;
+// Allocate nbytes of device memory from the caching allocator on the
+// current device. Zero-byte requests short-circuit to nullptr without
+// touching the allocator; otherwise the allocation is associated with
+// the current CUDA stream of the current device.
+void* raw_alloc(size_t nbytes) {
+ if (nbytes == 0) {
+ return nullptr;
+ }
+ int device;
+ C10_CUDA_CHECK(cudaGetDevice(&device));
+ void* r = nullptr;
+ // Unlike the removed Legacy::Alloc, the stream is not caller-supplied:
+ // we always use the device's current stream.
+ caching_allocator.malloc(&r, nbytes, cuda::getCurrentCUDAStream(device));
+ return r;
}
-cudaError_t Legacy::Free(void* ptr) {
+// Return a pointer previously obtained from raw_alloc to the caching
+// allocator for reuse. Void return (the old Legacy::Free always
+// reported cudaSuccess anyway).
+void raw_delete(void* ptr) {
caching_allocator.free(ptr);
- return cudaSuccess;
}
-
} // namespace CUDACachingAllocator
}} // namespace c10::cuda
namespace CUDACachingAllocator {
+// Raw malloc/free-style entry points into the caching allocator, for
+// callers (e.g. Caffe2's memory pool) that manage bare pointers directly
+// instead of going through the c10::Allocator interface.
+C10_CUDA_API void* raw_alloc(size_t nbytes);
+C10_CUDA_API void raw_delete(void* ptr);
+
C10_CUDA_API Allocator* get();
C10_CUDA_API void emptyCache();
C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock);
C10_CUDA_API std::shared_ptr<void> getIpcDevPtr(std::string handle);
-// Caffe2 legacy entrypoint
-
-struct THCCachingAllocator;
-
-class C10_CUDA_API Legacy {
- public:
- cudaError_t Alloc(void** refPtr, size_t nbytes, cudaStream_t stream);
- cudaError_t Free(void* ptr);
-};
-
} // namespace CUDACachingAllocator
}} // namespace c10::cuda
+// CUB-based device allocator; initialized via SetUpCub() when the
+// "cub" memory pool is selected.
std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
-std::unique_ptr<c10::cuda::CUDACachingAllocator::Legacy> g_thc_allocator;
-
// an unordered map that holds the map from the cuda memory pointer to the
// device id that it is allocated from. This is used in the cuda memory pool
// cases, where we need the device id to carry out the deletion.
SetUpCub();
} else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
- g_thc_allocator.reset(new c10::cuda::CUDACachingAllocator::Legacy());
+ // No per-pool setup needed anymore: the "thc" pool now routes
+ // straight through c10::cuda::CUDACachingAllocator::raw_alloc /
+ // raw_delete instead of a Legacy allocator object.
} else {
CAFFE_THROW(
"Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
}
return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
case CudaMemoryPoolType::THC:
- CUDA_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */));
+ {
+ // The reason we have this stream guard here is to preserve
+ // the historical behavior of the 'thc' allocator in Caffe2,
+ // which is to put all allocations on the same (default)
+ // stream. This behavior is morally wrong (since passing
+ // allocations between streams allows for the possibility
+ // of you handing out some memory that an old stream
+ // is still working on), but it doesn't seem to cause issues
+ // in Caffe2 today. Our hypothesis for why this is the case
+ // is that Caffe2 doesn't really do very many allocations
+ // on the fly; instead they allocate once and then reuse
+ // the allocations for the whole program. In this case,
+ // the hazard is avoided.
+ //
+ // We intend to remove this stream guard, but the benefit
+ // to putting all allocations on the same stream is it
+ // reduces per-stream fragmentation, and this helps
+ // some models that are currently running with the thc
+ // allocator fit in memory. We will need to find some
+ // way of resolving this problem.
+ cuda::CUDAStreamGuard g(
+ Stream(
+ Stream::DEFAULT,
+ Device(kCUDA, CaffeCudaGetDevice())
+ ));
+ ptr = cuda::CUDACachingAllocator::raw_alloc(nbytes);
+ }
+ // Optional bookkeeping: record the size and owning device of this
+ // allocation so the tracking/deletion paths can look them up later.
+ // NOTE(review): raw_alloc returns nullptr for nbytes == 0, in which
+ // case the maps below would key on nullptr — confirm callers never
+ // request zero bytes with tracking enabled.
if (FLAGS_caffe2_gpu_memory_tracking) {
g_size_map[ptr] = nbytes;
g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
break;
}
case CudaMemoryPoolType::THC: {
- CUDA_ENFORCE(g_thc_allocator->Free(ptr));
+ // raw_delete returns void, so there is no status code to
+ // CUDA_ENFORCE here (the old Legacy::Free always returned success).
+ cuda::CUDACachingAllocator::raw_delete(ptr);
if (FLAGS_caffe2_gpu_memory_tracking) {
+ // NOTE(review): assumes ptr was recorded at allocation time;
+ // erase(find(ptr)) on a missing key would call erase(end()) — verify.
g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
}