Remove Legacy entry point. (#16721)

author Edward Yang <ezyang@fb.com>

Fri, 8 Feb 2019 17:29:59 +0000 (09:29 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Fri, 8 Feb 2019 17:33:58 +0000 (09:33 -0800)
author Edward Yang <ezyang@fb.com>
Fri, 8 Feb 2019 17:29:59 +0000 (09:29 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Fri, 8 Feb 2019 17:33:58 +0000 (09:33 -0800)
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp

index a24df49..11047b4 100644 (file)
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -660,18 +660,21 @@ std::shared_ptr<void> getIpcDevPtr(std::string handle) {
    return sp;
  }
  
-cudaError_t
-Legacy::Alloc(void** refPtr, size_t nbytes, cudaStream_t stream) {
-  caching_allocator.malloc(refPtr, nbytes, stream);
-  return cudaSuccess;
+void* raw_alloc(size_t nbytes) {
+  if (nbytes == 0) {
+    return nullptr;
+  }
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  void* r = nullptr;
+  caching_allocator.malloc(&r, nbytes, cuda::getCurrentCUDAStream(device));
+  return r;
  }
  
-cudaError_t Legacy::Free(void* ptr) {
+void raw_delete(void* ptr) {
    caching_allocator.free(ptr);
-  return cudaSuccess;
  }
  
-
  } // namespace CUDACachingAllocator
  
  }} // namespace c10::cuda
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h

index f30036f..f146513 100644 (file)
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -25,6 +25,9 @@ namespace cuda {
  
  namespace CUDACachingAllocator {
  
+C10_CUDA_API void* raw_alloc(size_t nbytes);
+C10_CUDA_API void raw_delete(void* ptr);
+
  C10_CUDA_API Allocator* get();
  C10_CUDA_API void emptyCache();
  C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock);
@@ -41,16 +44,6 @@ C10_CUDA_API std::mutex* getFreeMutex();
  
  C10_CUDA_API std::shared_ptr<void> getIpcDevPtr(std::string handle);
  
-// Caffe2 legacy entrypoint
-
-struct THCCachingAllocator;
-
-class C10_CUDA_API Legacy {
- public:
-  cudaError_t Alloc(void** refPtr, size_t nbytes, cudaStream_t stream);
-  cudaError_t Free(void* ptr);
-};
-
  } // namespace CUDACachingAllocator
  
  }} // namespace c10::cuda
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu

index 2d5f18d..48e5986 100644 (file)
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -159,8 +159,6 @@ CudaMemoryPoolType g_cuda_memory_pool_type;
  
  std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
  
-std::unique_ptr<c10::cuda::CUDACachingAllocator::Legacy> g_thc_allocator;
-
  // an unordered map that holds the map from the cuda memory pointer to the
  // device id that it is allocated from. This is used in the cuda memory pool
  // cases, where we need the device id to carry out the deletion.
@@ -274,7 +272,6 @@ static void Caffe2SetCUDAMemoryPool() {
      SetUpCub();
    } else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
      g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
-    g_thc_allocator.reset(new c10::cuda::CUDACachingAllocator::Legacy());
    } else {
      CAFFE_THROW(
          "Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
@@ -428,7 +425,33 @@ struct DefaultCUDAAllocator final : public at::Allocator {
          }
          return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
        case CudaMemoryPoolType::THC:
-        CUDA_ENFORCE(g_thc_allocator->Alloc(&ptr, nbytes, 0 /* stream */));
+        {
+          // The reason we have this stream guard here is to preserve
+          // the historical behavior of the 'thc' allocator in Caffe2,
+          // which is to put all allocations on the same (default)
+          // stream.  This behavior is morally wrong (since passing
+          // allocations between streams allows for the possibility
+          // of you handing out some memory that an old stream
+          // is still working on), but it doesn't seem to cause issues
+          // in Caffe2 today.  Our hypothesis for why this is the case
+          // is that Caffe2 doesn't really do very many allocations
+          // on the fly; instead they allocate once and then reuse
+          // the allocations for the whole program.  In this case,
+          // the hazard is avoided.
+          //
+          // We intend to remove this stream guard, but the benefit
+          // to putting all allocations on the same stream is it
+          // reduces per-stream fragmentation, and this helps
+          // some models that are currently running with the thc
+          // allocator fit in memory.  We will need to find some
+          // way of resolving this problem.
+          cuda::CUDAStreamGuard g(
+            Stream(
+              Stream::DEFAULT,
+              Device(kCUDA, CaffeCudaGetDevice())
+            ));
+          ptr = cuda::CUDACachingAllocator::raw_alloc(nbytes);
+        }
          if (FLAGS_caffe2_gpu_memory_tracking) {
            g_size_map[ptr] = nbytes;
            g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
@@ -486,7 +509,7 @@ struct DefaultCUDAAllocator final : public at::Allocator {
          break;
        }
        case CudaMemoryPoolType::THC: {
-        CUDA_ENFORCE(g_thc_allocator->Free(ptr));
+        cuda::CUDACachingAllocator::raw_delete(ptr);
          if (FLAGS_caffe2_gpu_memory_tracking) {
            g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
          }
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h

index ff6f613..0e62708 100644 (file)
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -23,6 +23,7 @@
  #include <c10/core/Device.h>
  #include <c10/core/Stream.h>
  #include <c10/cuda/CUDAStream.h>
+#include <c10/cuda/CUDAGuard.h>
  
  namespace caffe2 {
author	Edward Yang <ezyang@fb.com>
	Fri, 8 Feb 2019 17:29:59 +0000 (09:29 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Fri, 8 Feb 2019 17:33:58 +0000 (09:33 -0800)
c10/cuda/CUDACachingAllocator.cpp		patch \| blob \| history
c10/cuda/CUDACachingAllocator.h		patch \| blob \| history
caffe2/core/context_gpu.cu		patch \| blob \| history
caffe2/core/context_gpu.h		patch \| blob \| history