From cc402d8fa157cbd995a42dea41e38836db20fbc4 Mon Sep 17 00:00:00 2001 From: ashishfarmer Date: Tue, 8 Jan 2019 11:23:01 -0800 Subject: [PATCH] Use correct workspace alloc call in MIOpen conv operator (#15712) Summary: This PR contains changes for: 1. Using memory alloc from HIPContext while allocating workspace for MIOpen conv and transpose_conv operators rather than direct HIP mem alloc 2. Minor cleanup and removing an unnecessary sync call from MIOpen conv op Differential Revision: D13598894 Pulled By: bddppq fbshipit-source-id: 44886161abdf91cd29c7c93b3e23620e1b09c7c9 --- caffe2/operators/hip/conv_op_miopen.hip | 65 ++++------------------- caffe2/operators/hip/conv_transpose_op_miopen.hip | 59 ++++---------------- 2 files changed, 18 insertions(+), 106 deletions(-) diff --git a/caffe2/operators/hip/conv_op_miopen.hip b/caffe2/operators/hip/conv_op_miopen.hip index b82a6c8..3de03bf 100644 --- a/caffe2/operators/hip/conv_op_miopen.hip +++ b/caffe2/operators/hip/conv_op_miopen.hip @@ -21,11 +21,6 @@ namespace caffe2 { -// Earlier in the days Caffe sets the default miopen workspace to 8MB. We bump -// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases, -// something very beneficial to more recent CNN models. -static constexpr size_t kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024; - class MIOPENConvOpBase : public ConvPoolOpBase { public: MIOPENConvOpBase(const OperatorDef& operator_def, Workspace* ws) @@ -33,9 +28,6 @@ class MIOPENConvOpBase : public ConvPoolOpBase { miopen_wrapper_(&context_), miopen_state_( OperatorBase::GetSingleArgument("miopen_state", 0)), - miopen_ws_nbytes_limit_(OperatorBase::GetSingleArgument( - "ws_nbytes_limit", - kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES)), exhaustive_search_( OperatorBase::GetSingleArgument("exhaustive_search", false)), alpha_(OperatorBase::GetSingleArgument("alpha", 1.0)), @@ -81,7 +73,6 @@ class MIOPENConvOpBase : public ConvPoolOpBase { miopenConvolutionDescriptor_t conv_desc_; miopenConvolutionMode_t mode_; size_t miopen_state_; - const size_t miopen_ws_nbytes_limit_; bool exhaustive_search_; const float alpha_; const float beta_; @@ -97,17 +88,9 @@ class MIOPENConvOp final : public MIOPENConvOpBase { OperatorBase::GetSingleArgument("returnedAlgoCount_", 1)), bestAlgoFound_( OperatorBase::GetSingleArgument("bestAlgoFound_", false)), - fwdConvWs_(nullptr), - fwdConvWsSize_(0), fwdAlgo_(miopenConvolutionFwdAlgoGEMM) {} - ~MIOPENConvOp() { - if (fwdConvWs_) { - hipFree(fwdConvWs_); - fwdConvWs_ = nullptr; - fwdConvWsSize_ = 0; - } - } + ~MIOPENConvOp() {} template < typename T_X, @@ -122,7 +105,6 @@ class MIOPENConvOp final : public MIOPENConvOpBase { const int requestAlgoCount_; int returnedAlgoCount_; bool bestAlgoFound_; - char* fwdConvWs_; size_t fwdConvWsSize_; miopenConvFwdAlgorithm_t fwdAlgo_; // Input: X, W, b @@ -144,28 +126,13 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase { bestWeightAlgoFound_( OperatorBase::GetSingleArgument("bestAlgoFound", false)), bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM), - bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM), - bwdWeightWsSize_(0), - bwdDataWsSize_(0), - bwdWeightWs_(nullptr), - bwdDataWs_(nullptr) { + bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) { CAFFE_ENFORCE( !(no_bias_ && OutputSize() == 3), "If bias is not present, you should not have 3 grad output."); } - ~MIOPENConvGradientOp() { - if (bwdWeightWs_) { - hipFree(bwdWeightWs_); - bwdWeightWs_ = nullptr; - bwdWeightWsSize_ = 0; - } - if (bwdDataWs_) { - hipFree(bwdDataWs_); - bwdDataWs_ = nullptr; - bwdDataWsSize_ = 0; - } - } + ~MIOPENConvGradientOp() {} template < typename T_X, @@ -189,8 +156,6 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase { miopenConvBwdDataAlgorithm_t bwdDataAlgo_; size_t bwdWeightWsSize_; size_t bwdDataWsSize_; - char* bwdWeightWs_; - char* bwdDataWs_; // input: X, W, dY // output: dW, db, and optionally dX INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD); @@ -300,9 +265,6 @@ bool MIOPENConvOp::DoRunWithType() { conv_desc_, top_desc_, &fwdConvWsSize_)); - if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( @@ -317,7 +279,7 @@ bool MIOPENConvOp::DoRunWithType() { requestAlgoCount_, &returnedAlgoCount_, &perf, - fwdConvWs_, + state->workspace().get(fwdConvWsSize_), fwdConvWsSize_, false)); }); @@ -338,7 +300,7 @@ bool MIOPENConvOp::DoRunWithType() { &beta_, top_desc_, Y->template mutable_data(), - fwdConvWs_, + state->workspace().get(fwdConvWsSize_), fwdConvWsSize_)); }); @@ -498,9 +460,6 @@ bool MIOPENConvGradientOp::DoRunWithType() { conv_desc_, bottom_desc_, &bwdDataWsSize_)); - if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm( @@ -515,7 +474,7 @@ bool MIOPENConvGradientOp::DoRunWithType() { requestAlgoCount_, &returnedAlgoCount_, &perf, - bwdDataWs_, + state->workspace().get(bwdDataWsSize_), bwdDataWsSize_, false)); }); @@ -534,9 +493,6 @@ bool MIOPENConvGradientOp::DoRunWithType() { conv_desc_, weight_desc_, &bwdWeightWsSize_)); - if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( @@ -551,7 +507,7 @@ bool MIOPENConvGradientOp::DoRunWithType() { requestAlgoCount_, &returnedAlgoCount_, &perf, - bwdWeightWs_, + state->workspace().get(bwdWeightWsSize_), bwdWeightWsSize_, false)); }); @@ -573,7 +529,7 @@ bool MIOPENConvGradientOp::DoRunWithType() { &beta_, bottom_desc_, dX->template mutable_data(), - bwdDataWs_, + state->workspace().get(bwdDataWsSize_), bwdDataWsSize_)); }); } @@ -591,13 +547,10 @@ bool MIOPENConvGradientOp::DoRunWithType() { &beta_, weight_desc_, dW->template mutable_data(), - bwdWeightWs_, + state->workspace().get(bwdWeightWsSize_), bwdWeightWsSize_)); }); - // Synchronize the work across groups. - hipDeviceSynchronize(); - ////////////////////////////////////// BIAS /////////////////////////// if (!no_bias_) { auto* dbias = Output(BIAS_OR_INPUT_GRAD); diff --git a/caffe2/operators/hip/conv_transpose_op_miopen.hip b/caffe2/operators/hip/conv_transpose_op_miopen.hip index f3d2f28..0f4d6a2 100644 --- a/caffe2/operators/hip/conv_transpose_op_miopen.hip +++ b/caffe2/operators/hip/conv_transpose_op_miopen.hip @@ -4,8 +4,6 @@ namespace caffe2 { -static constexpr size_t kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024; - class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase { public: MIOPENConvTransposeOpBase(const OperatorDef& operator_def, Workspace* ws) @@ -13,9 +11,6 @@ class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase { miopen_wrapper_(&context_), miopen_state_( OperatorBase::GetSingleArgument("miopen_state", 0)), - miopen_ws_nbytes_limit_(OperatorBase::GetSingleArgument( - "ws_nbytes_limit", - kCONV_MIOPEN_WORKSPACE_LIMIT_BYTES)), exhaustive_search_( OperatorBase::GetSingleArgument("exhaustive_search", false)), alpha_(OperatorBase::GetSingleArgument("alpha", 1.0)), @@ -48,7 +43,6 @@ class MIOPENConvTransposeOpBase : public ConvTransposeUnpoolBase { miopenTensorDescriptor_t top_desc_for_bias_; miopenConvolutionDescriptor_t conv_desc_; size_t miopen_state_; - const size_t miopen_ws_nbytes_limit_; bool exhaustive_search_; const float alpha_; const float beta_; @@ -65,17 +59,9 @@ class MIOPENConvTransposeOp final : public MIOPENConvTransposeOpBase { OperatorBase::GetSingleArgument("returnedAlgoCount_", 1)), bestAlgoFound_( OperatorBase::GetSingleArgument("bestAlgoFound_", false)), - fwdConvWs_(nullptr), - fwdConvWsSize_(0), fwdAlgo_(miopenConvolutionFwdAlgoGEMM) {} - ~MIOPENConvTransposeOp() { - if (fwdConvWs_) { - hipFree(fwdConvWs_); - fwdConvWs_ = nullptr; - fwdConvWsSize_ = 0; - } - } + ~MIOPENConvTransposeOp() {} bool RunOnDevice() override; @@ -83,7 +69,6 @@ class MIOPENConvTransposeOp final : public MIOPENConvTransposeOpBase { const int requestAlgoCount_; int returnedAlgoCount_; bool bestAlgoFound_; - char* fwdConvWs_; size_t fwdConvWsSize_; miopenConvFwdAlgorithm_t fwdAlgo_; // Input: X, W, b @@ -106,28 +91,13 @@ class MIOPENConvTransposeGradientOp final : public MIOPENConvTransposeOpBase { bestWeightAlgoFound_( OperatorBase::GetSingleArgument("bestAlgoFound", false)), bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM), - bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM), - bwdWeightWsSize_(0), - bwdDataWsSize_(0), - bwdWeightWs_(nullptr), - bwdDataWs_(nullptr) { + bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) { CAFFE_ENFORCE( !(no_bias_ && OutputSize() == 3), "If bias is not present, you should not have 3 grad output."); } - ~MIOPENConvTransposeGradientOp() { - if (bwdWeightWs_) { - hipFree(bwdWeightWs_); - bwdWeightWs_ = nullptr; - bwdWeightWsSize_ = 0; - } - if (bwdDataWs_) { - hipFree(bwdDataWs_); - bwdDataWs_ = nullptr; - bwdDataWsSize_ = 0; - } - } + ~MIOPENConvTransposeGradientOp() {} bool RunOnDevice() override; @@ -141,8 +111,6 @@ class MIOPENConvTransposeGradientOp final : public MIOPENConvTransposeOpBase { miopenConvBwdDataAlgorithm_t bwdDataAlgo_; size_t bwdWeightWsSize_; size_t bwdDataWsSize_; - char* bwdWeightWs_; - char* bwdDataWs_; // input: X, W, dY // output: dW, db, and optionally dX INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD); @@ -230,9 +198,6 @@ bool MIOPENConvTransposeOp::RunOnDevice() { conv_desc_, top_desc_, &fwdConvWsSize_)); - if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( @@ -247,7 +212,7 @@ bool MIOPENConvTransposeOp::RunOnDevice() { requestAlgoCount_, &returnedAlgoCount_, &perf, - fwdConvWs_, + state->workspace().get(fwdConvWsSize_), fwdConvWsSize_, false)); }); @@ -269,7 +234,7 @@ bool MIOPENConvTransposeOp::RunOnDevice() { &beta_, top_desc_, Y->template mutable_data(), - fwdConvWs_, + state->workspace().get(fwdConvWsSize_), fwdConvWsSize_)); }); @@ -374,9 +339,6 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { conv_desc_, bottom_desc_, &bwdDataWsSize_)); - if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm( @@ -391,7 +353,7 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { requestAlgoCount_, &returnedAlgoCount_, &perf, - bwdDataWs_, + state->workspace().get(bwdDataWsSize_), bwdDataWsSize_, false)); }); @@ -410,9 +372,6 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { conv_desc_, weight_desc_, &bwdWeightWsSize_)); - if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); - } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( @@ -427,7 +386,7 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { requestAlgoCount_, &returnedAlgoCount_, &perf, - bwdWeightWs_, + state->workspace().get(bwdWeightWsSize_), bwdWeightWsSize_, false)); }); @@ -449,7 +408,7 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { &beta_, bottom_desc_, dX->template mutable_data(), - bwdDataWs_, + state->workspace().get(bwdDataWsSize_), bwdDataWsSize_)); }); } @@ -467,7 +426,7 @@ bool MIOPENConvTransposeGradientOp::RunOnDevice() { &beta_, weight_desc_, dW->template mutable_data(), - bwdWeightWs_, + state->workspace().get(bwdWeightWsSize_), bwdWeightWsSize_)); }); -- 2.7.4