From b9b0be7af29e3a22dbddcfddb545ca5406dc2c8b Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Fri, 8 Feb 2019 09:29:59 -0800
Subject: [PATCH] Remove Legacy entry point. (#16721)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16721

The key line is that we have to set the stream to the default stream
before calling the allocator. This is very interesting: it shouldn't
be necessary, but seemingly it is!

Reviewed By: dzhulgakov

Differential Revision: D13943193

fbshipit-source-id: c21014917d9fe504fab0ad8abbc025787f559287
---
 c10/cuda/CUDACachingAllocator.cpp | 17 ++++++++++-------
 c10/cuda/CUDACachingAllocator.h   | 13 +++----------
 caffe2/core/context_gpu.cu        | 33 ++++++++++++++++++++++++++++-----
 caffe2/core/context_gpu.h         |  1 +
 4 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index a24df49..11047b4 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -660,18 +660,21 @@ std::shared_ptr<void> getIpcDevPtr(std::string handle) {
   return sp;
 }
 
-cudaError_t
-Legacy::Alloc(void** refPtr, size_t nbytes, cudaStream_t stream) {
-  caching_allocator.malloc(refPtr, nbytes, stream);
-  return cudaSuccess;
+void* raw_alloc(size_t nbytes) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  void* r = nullptr;
+  caching_allocator.malloc(&r, nbytes, cuda::getCurrentCUDAStream(device));
+  return r;
 }
 
-cudaError_t Legacy::Free(void* ptr) {
+void raw_delete(void* ptr) {
   caching_allocator.free(ptr);
-  return cudaSuccess;
 }
-
 } // namespace CUDACachingAllocator
 }} // namespace c10::cuda
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index f30036f..f146513 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -25,6 +25,9 @@ namespace cuda {
 
 namespace CUDACachingAllocator {
 
+C10_CUDA_API void* raw_alloc(size_t nbytes);
+C10_CUDA_API void raw_delete(void* ptr);
+
 C10_CUDA_API Allocator* get();
 C10_CUDA_API void emptyCache();
 C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock);
@@ -41,16 +44,6 @@
 C10_CUDA_API std::mutex* getFreeMutex();
 
 C10_CUDA_API std::shared_ptr<void> getIpcDevPtr(std::string handle);
 
-// Caffe2 legacy entrypoint
-
-struct THCCachingAllocator;
-
-class C10_CUDA_API Legacy {
- public:
-  cudaError_t Alloc(void** refPtr, size_t nbytes, cudaStream_t stream);
-  cudaError_t Free(void* ptr);
-};
-
 } // namespace CUDACachingAllocator
 }} // namespace c10::cuda
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 2d5f18d..48e5986 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -159,8 +159,6 @@ CudaMemoryPoolType g_cuda_memory_pool_type;
 
 std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
 
-std::unique_ptr<c10::cuda::CUDACachingAllocator::Legacy> g_thc_allocator;
-
 // an unordered map that holds the map from the cuda memory pointer to the
 // device id that it is allocated from. This is used in the cuda memory pool
 // cases, where we need the device id to carry out the deletion.
@@ -274,7 +272,6 @@ static void Caffe2SetCUDAMemoryPool() {
     SetUpCub();
   } else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
     g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
-    g_thc_allocator.reset(new c10::cuda::CUDACachingAllocator::Legacy());
   } else {
     CAFFE_THROW(
         "Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
@@ -428,7 +425,33 @@ struct DefaultCUDAAllocator final : public at::Allocator {
         }
         return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
       case CudaMemoryPoolType::THC:
-        CUDA_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */));
+        {
+          // The reason we have this stream guard here is to preserve
+          // the historical behavior of the 'thc' allocator in Caffe2,
+          // which is to put all allocations on the same (default)
+          // stream. This behavior is morally wrong (since passing
+          // allocations between streams allows for the possibility
+          // of handing out memory that an old stream
+          // is still working on), but it doesn't seem to cause issues
+          // in Caffe2 today. Our hypothesis for why this is the case
+          // is that Caffe2 doesn't really do very many allocations
+          // on the fly; instead, it allocates once and then reuses
+          // the allocations for the whole program. In that case,
+          // the hazard is avoided.
+          //
+          // We intend to remove this stream guard, but the benefit
+          // of putting all allocations on the same stream is that it
+          // reduces per-stream fragmentation, and this helps
+          // some models that are currently running with the thc
+          // allocator fit in memory. We will need to find some
+          // way of resolving this problem.
+          cuda::CUDAStreamGuard g(
+              Stream(
+                  Stream::DEFAULT,
+                  Device(kCUDA, CaffeCudaGetDevice())
+              ));
+          ptr = cuda::CUDACachingAllocator::raw_alloc(nbytes);
+        }
         if (FLAGS_caffe2_gpu_memory_tracking) {
           g_size_map[ptr] = nbytes;
           g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
@@ -486,7 +509,7 @@ struct DefaultCUDAAllocator final : public at::Allocator {
         break;
       }
       case CudaMemoryPoolType::THC: {
-        CUDA_ENFORCE(g_thc_allocator->Free(ptr));
+        cuda::CUDACachingAllocator::raw_delete(ptr);
         if (FLAGS_caffe2_gpu_memory_tracking) {
           g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
         }
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index ff6f613..0e62708 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <c10/cuda/CUDACachingAllocator.h>
 
 namespace caffe2 {
-- 
2.7.4
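
P.S. A minimal usage sketch of the new entry points, for anyone porting
callers off Legacy. The function name and the elided kernel launch are
illustrative only, not part of this patch: raw_alloc hands out memory from
the caching allocator on the current stream of the current device, and the
block must go back through raw_delete, never cudaFree.

  #include <cstddef>
  #include <c10/cuda/CUDACachingAllocator.h>

  // Hypothetical caller: borrow a scratch buffer from the caching
  // allocator, use it on the current stream, then hand it back.
  void scratch_example(size_t nbytes) {
    // Draws from the cache (cudaMalloc only on a miss) on the current
    // stream of the current device; returns nullptr when nbytes == 0.
    void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
    // ... launch kernels that consume buf on the current stream ...
    // Returns the block to the cache; it is not cudaFree'd here.
    c10::cuda::CUDACachingAllocator::raw_delete(buf);
  }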