Use correct workspace alloc call in MIOpen conv operator (#15712)
author: ashishfarmer <ashish.farmer@amd.com>
Tue, 8 Jan 2019 19:23:01 +0000 (11:23 -0800)
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Tue, 8 Jan 2019 19:38:45 +0000 (11:38 -0800)
Summary:
This PR contains changes for:
1. Using the memory allocator from HIPContext (via the MIOpen state's workspace) when allocating workspace for the MIOpen conv and conv_transpose operators, rather than direct HIP memory allocation (hipMalloc/hipFree)
2. Minor cleanup and removing an unnecessary sync call from MIOpen conv op

Differential Revision: D13598894

Pulled By: bddppq

fbshipit-source-id: 44886161abdf91cd29c7c93b3e23620e1b09c7c9

caffe2/operators/hip/conv_op_miopen.hip
caffe2/operators/hip/conv_transpose_op_miopen.hip

index b82a6c8..3de03bf 100644 (file)
 
 namespace caffe2 {
 
-// Earlier in the days Caffe sets the default miopen workspace to 8MB. We bump
-// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases,
-// something very beneficial to more recent CNN models.
-static constexpr size_t kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
-
 class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
  public:
   MIOPENConvOpBase(const OperatorDef& operator_def, Workspace* ws)
@@ -33,9 +28,6 @@ class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
         miopen_wrapper_(&context_),
         miopen_state_(
             OperatorBase::GetSingleArgument<size_t>("miopen_state", 0)),
-        miopen_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
-            "ws_nbytes_limit",
-            kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES)),
         exhaustive_search_(
             OperatorBase::GetSingleArgument<bool>("exhaustive_search", false)),
         alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)),
@@ -81,7 +73,6 @@ class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
   miopenConvolutionDescriptor_t conv_desc_;
   miopenConvolutionMode_t mode_;
   size_t miopen_state_;
-  const size_t miopen_ws_nbytes_limit_;
   bool exhaustive_search_;
   const float alpha_;
   const float beta_;
@@ -97,17 +88,9 @@ class MIOPENConvOp final : public MIOPENConvOpBase {
             OperatorBase::GetSingleArgument<int>("returnedAlgoCount_", 1)),
         bestAlgoFound_(
             OperatorBase::GetSingleArgument<bool>("bestAlgoFound_", false)),
-        fwdConvWs_(nullptr),
-        fwdConvWsSize_(0),
         fwdAlgo_(miopenConvolutionFwdAlgoGEMM) {}
 
-  ~MIOPENConvOp() {
-    if (fwdConvWs_) {
-      hipFree(fwdConvWs_);
-      fwdConvWs_ = nullptr;
-      fwdConvWsSize_ = 0;
-    }
-  }
+  ~MIOPENConvOp() {}
 
   template <
       typename T_X,
@@ -122,7 +105,6 @@ class MIOPENConvOp final : public MIOPENConvOpBase {
   const int requestAlgoCount_;
   int returnedAlgoCount_;
   bool bestAlgoFound_;
-  char* fwdConvWs_;
   size_t fwdConvWsSize_;
   miopenConvFwdAlgorithm_t fwdAlgo_;
   // Input: X, W, b
@@ -144,28 +126,13 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase {
         bestWeightAlgoFound_(
             OperatorBase::GetSingleArgument<bool>("bestAlgoFound", false)),
         bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM),
-        bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM),
-        bwdWeightWsSize_(0),
-        bwdDataWsSize_(0),
-        bwdWeightWs_(nullptr),
-        bwdDataWs_(nullptr) {
+        bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) {
     CAFFE_ENFORCE(
         !(no_bias_ && OutputSize() == 3),
         "If bias is not present, you should not have 3 grad output.");
   }
 
-  ~MIOPENConvGradientOp() {
-    if (bwdWeightWs_) {
-      hipFree(bwdWeightWs_);
-      bwdWeightWs_ = nullptr;
-      bwdWeightWsSize_ = 0;
-    }
-    if (bwdDataWs_) {
-      hipFree(bwdDataWs_);
-      bwdDataWs_ = nullptr;
-      bwdDataWsSize_ = 0;
-    }
-  }
+  ~MIOPENConvGradientOp() {}
 
   template <
       typename T_X,
@@ -189,8 +156,6 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase {
   miopenConvBwdDataAlgorithm_t bwdDataAlgo_;
   size_t bwdWeightWsSize_;
   size_t bwdDataWsSize_;
-  char* bwdWeightWs_;
-  char* bwdDataWs_;
   // input: X, W, dY
   // output: dW, db, and optionally dX
   INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
@@ -300,9 +265,6 @@ bool MIOPENConvOp::DoRunWithType() {
           conv_desc_,
           top_desc_,
           &fwdConvWsSize_));
-      if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_));
-      }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
           MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm(
@@ -317,7 +279,7 @@ bool MIOPENConvOp::DoRunWithType() {
             requestAlgoCount_,
             &returnedAlgoCount_,
             &perf,
-            fwdConvWs_,
+            state->workspace().get(fwdConvWsSize_),
             fwdConvWsSize_,
             false));
       });
@@ -338,7 +300,7 @@ bool MIOPENConvOp::DoRunWithType() {
         &beta_,
         top_desc_,
         Y->template mutable_data<T_Y>(),
-        fwdConvWs_,
+        state->workspace().get(fwdConvWsSize_),
         fwdConvWsSize_));
   });
 
@@ -498,9 +460,6 @@ bool MIOPENConvGradientOp::DoRunWithType() {
           conv_desc_,
           bottom_desc_,
           &bwdDataWsSize_));
-      if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_));
-      }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
         MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm(
@@ -515,7 +474,7 @@ bool MIOPENConvGradientOp::DoRunWithType() {
           requestAlgoCount_,
           &returnedAlgoCount_,
           &perf,
-          bwdDataWs_,
+          state->workspace().get(bwdDataWsSize_),
           bwdDataWsSize_,
           false));
       });
@@ -534,9 +493,6 @@ bool MIOPENConvGradientOp::DoRunWithType() {
             conv_desc_,
             weight_desc_,
             &bwdWeightWsSize_));
-        if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_));
-        }
 
         miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
           MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm(
@@ -551,7 +507,7 @@ bool MIOPENConvGradientOp::DoRunWithType() {
             requestAlgoCount_,
             &returnedAlgoCount_,
             &perf,
-            bwdWeightWs_,
+            state->workspace().get(bwdWeightWsSize_),
             bwdWeightWsSize_,
             false));
         });
@@ -573,7 +529,7 @@ bool MIOPENConvGradientOp::DoRunWithType() {
           &beta_,
           bottom_desc_,
           dX->template mutable_data<T_DX>(),
-          bwdDataWs_,
+          state->workspace().get(bwdDataWsSize_),
           bwdDataWsSize_));
         });
   }
@@ -591,13 +547,10 @@ bool MIOPENConvGradientOp::DoRunWithType() {
       &beta_,
       weight_desc_,
       dW->template mutable_data<T_DW>(),
-      bwdWeightWs_,
+      state->workspace().get(bwdWeightWsSize_),
       bwdWeightWsSize_));
   });
 
-  // Synchronize the work across groups.
-  hipDeviceSynchronize();
-
   ////////////////////////////////////// BIAS ///////////////////////////
   if (!no_bias_) {
       auto* dbias = Output(BIAS_OR_INPUT_GRAD);
index f3d2f28..0f4d6a2 100644 (file)
@@ -4,8 +4,6 @@
 
 namespace caffe2 {
 
-static constexpr size_t kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;
-
 class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase<HIPContext> {
  public:
   MIOPENConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws)
@@ -13,9 +11,6 @@ class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase<HIPContext> {
         miopen_wrapper_(&context_),
         miopen_state_(
             OperatorBase::GetSingleArgument<size_t>("miopen_state", 0)),
-        miopen_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
-            "ws_nbytes_limit",
-            kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES)),
         exhaustive_search_(
             OperatorBase::GetSingleArgument<bool>("exhaustive_search", false)),
         alpha_(OperatorBase::GetSingleArgument<float>("alpha", 1.0)),
@@ -48,7 +43,6 @@ class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase<HIPContext> {
   miopenTensorDescriptor_t top_desc_for_bias_;
   miopenConvolutionDescriptor_t conv_desc_;
   size_t miopen_state_;
-  const size_t miopen_ws_nbytes_limit_;
   bool exhaustive_search_;
   const float alpha_;
   const float beta_;
@@ -65,17 +59,9 @@ class MIOPENConvTransposeOp final : public MIOPENConvTransposeOpBase {
             OperatorBase::GetSingleArgument<int>("returnedAlgoCount_", 1)),
         bestAlgoFound_(
             OperatorBase::GetSingleArgument<bool>("bestAlgoFound_", false)),
-        fwdConvWs_(nullptr),
-        fwdConvWsSize_(0),
         fwdAlgo_(miopenConvolutionFwdAlgoGEMM) {}
 
-  ~MIOPENConvTransposeOp() {
-    if (fwdConvWs_) {
-      hipFree(fwdConvWs_);
-      fwdConvWs_ = nullptr;
-      fwdConvWsSize_ = 0;
-    }
-  }
+  ~MIOPENConvTransposeOp() {}
 
   bool RunOnDevice() override;
 
@@ -83,7 +69,6 @@ class MIOPENConvTransposeOp final : public MIOPENConvTransposeOpBase {
   const int requestAlgoCount_;
   int returnedAlgoCount_;
   bool bestAlgoFound_;
-  char* fwdConvWs_;
   size_t fwdConvWsSize_;
   miopenConvFwdAlgorithm_t fwdAlgo_;
   // Input: X, W, b
@@ -106,28 +91,13 @@ class MIOPENConvTransposeGradientOp final : public MIOPENConvTransposeOpBase {
         bestWeightAlgoFound_(
             OperatorBase::GetSingleArgument<bool>("bestAlgoFound", false)),
         bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM),
-        bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM),
-        bwdWeightWsSize_(0),
-        bwdDataWsSize_(0),
-        bwdWeightWs_(nullptr),
-        bwdDataWs_(nullptr) {
+        bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) {
     CAFFE_ENFORCE(
         !(no_bias_ && OutputSize() == 3),
         "If bias is not present, you should not have 3 grad output.");
   }
 
-  ~MIOPENConvTransposeGradientOp() {
-    if (bwdWeightWs_) {
-      hipFree(bwdWeightWs_);
-      bwdWeightWs_ = nullptr;
-      bwdWeightWsSize_ = 0;
-    }
-    if (bwdDataWs_) {
-      hipFree(bwdDataWs_);
-      bwdDataWs_ = nullptr;
-      bwdDataWsSize_ = 0;
-    }
-  }
+  ~MIOPENConvTransposeGradientOp() {}
 
   bool RunOnDevice() override;
 
@@ -141,8 +111,6 @@ class MIOPENConvTransposeGradientOp final : public MIOPENConvTransposeOpBase {
   miopenConvBwdDataAlgorithm_t bwdDataAlgo_;
   size_t bwdWeightWsSize_;
   size_t bwdDataWsSize_;
-  char* bwdWeightWs_;
-  char* bwdDataWs_;
   // input: X, W, dY
   // output: dW, db, and optionally dX
   INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
@@ -230,9 +198,6 @@ bool MIOPENConvTransposeOp<T>::RunOnDevice() {
           conv_desc_,
           top_desc_,
           &fwdConvWsSize_));
-      if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_));
-      }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
           MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm(
@@ -247,7 +212,7 @@ bool MIOPENConvTransposeOp<T>::RunOnDevice() {
             requestAlgoCount_,
             &returnedAlgoCount_,
             &perf,
-            fwdConvWs_,
+            state->workspace().get(fwdConvWsSize_),
             fwdConvWsSize_,
             false));
       });
@@ -269,7 +234,7 @@ bool MIOPENConvTransposeOp<T>::RunOnDevice() {
         &beta_,
         top_desc_,
         Y->template mutable_data<T>(),
-        fwdConvWs_,
+        state->workspace().get(fwdConvWsSize_),
         fwdConvWsSize_));
   });
 
@@ -374,9 +339,6 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
           conv_desc_,
           bottom_desc_,
           &bwdDataWsSize_));
-      if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_));
-      }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
         MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm(
@@ -391,7 +353,7 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
           requestAlgoCount_,
           &returnedAlgoCount_,
           &perf,
-          bwdDataWs_,
+          state->workspace().get(bwdDataWsSize_),
           bwdDataWsSize_,
           false));
       });
@@ -410,9 +372,6 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
             conv_desc_,
             weight_desc_,
             &bwdWeightWsSize_));
-        if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) {
-          HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_));
-        }
 
         miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
           MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm(
@@ -427,7 +386,7 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
             requestAlgoCount_,
             &returnedAlgoCount_,
             &perf,
-            bwdWeightWs_,
+            state->workspace().get(bwdWeightWsSize_),
             bwdWeightWsSize_,
             false));
         });
@@ -449,7 +408,7 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
           &beta_,
           bottom_desc_,
           dX->template mutable_data<T>(),
-          bwdDataWs_,
+          state->workspace().get(bwdDataWsSize_),
           bwdDataWsSize_));
         });
   }
@@ -467,7 +426,7 @@ bool MIOPENConvTransposeGradientOp<T>::RunOnDevice() {
       &beta_,
       weight_desc_,
       dW->template mutable_data<T>(),
-      bwdWeightWs_,
+      state->workspace().get(bwdWeightWsSize_),
       bwdWeightWsSize_));
   });