// Initialize CUDA streams and cuDNN.
stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+ workspaceSizeInBytes = 0;
+ workspace = NULL;
for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
CUDA_CHECK(cudaStreamCreate(&stream_[g]));
Dtype* top_data = top[i]->mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
+ size_t workspace_limit_bytes = this->kernel_h_ *
+ this->kernel_w_ *
+ this->channels_ *
+ sizeof(int) + 1;
+
// Forward through cuDNN in parallel over groups.
for (int g = 0; g < this->group_; g++) {
cudnnConvolutionFwdAlgo_t algo;
filter_desc_,
conv_descs_[i],
top_descs_[i],
- CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
- 0, // memoryLimitInBytes,
+ CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ workspace_limit_bytes, // memoryLimitInBytes,
&algo));
// get minimum size of the workspace needed for the desired algorithm
conv_descs_[i],
top_descs_[i],
algo,
- &workspaceSizeInBytes));
+ &workspaceSizeInBytes_temp));
if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
workspaceSizeInBytes = workspaceSizeInBytes_temp;
// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspace);
- cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+ cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
+ if (err != cudaSuccess) {
+ // allocation failed: fall back to implicit GEMM and run with no workspace
+ algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+ workspace = NULL;
+ workspaceSizeInBytes = 0;
+ }
}
// Filters.