From 734eb31035c45079a3b4a6a2aac28fb7a28773cf Mon Sep 17 00:00:00 2001
From: mwootton
Date: Tue, 8 Jan 2019 12:23:30 -0800
Subject: [PATCH] Cache workspace size in the BenchmarkCache. (#15742)

Summary:
Cache the workspace size MIOpen reports for a given convolution
configuration instead of querying it every time. This reduces overhead
significantly: querying the workspace size forces a full read of MIOpen's
performance database, which has grown considerably in recent releases.
Caching the size gets us back to ideal performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/15742

Differential Revision: D13598932

Pulled By: bddppq

fbshipit-source-id: 4e65d247b71dec828293cf0562aac3fbd4fad83a
---
 aten/src/ATen/native/miopen/Conv_miopen.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp
index 1635161..c04e100 100644
--- a/aten/src/ATen/native/miopen/Conv_miopen.cpp
+++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp
@@ -336,6 +336,10 @@ BenchmarkCache<miopenConvFwdAlgorithm_t> fwd_algos;
 BenchmarkCache<miopenConvBwdDataAlgorithm_t> bwd_data_algos;
 BenchmarkCache<miopenConvBwdWeightsAlgorithm_t> bwd_filter_algos;
 
+BenchmarkCache<size_t> fwd_wssizes;
+BenchmarkCache<size_t> bwd_data_wssizes;
+BenchmarkCache<size_t> bwd_filter_wssizes;
+
 struct Workspace {
   Workspace(size_t size) : size(size), data(NULL) {
     data = THCudaMalloc(globalContext().lazyInitCUDA(), size);
@@ -409,6 +413,7 @@ struct algorithm_search<miopenConvFwdAlgorithm_t> {
   static constexpr auto DEFAULT_ALGO = miopenConvolutionFwdAlgoGEMM;
 
   static BenchmarkCache<algo_t>& cache() { return fwd_algos; }
+  static BenchmarkCache<size_t>& wsscache() { return fwd_wssizes; }
 
   static perf_t findAlgorithm(const ConvolutionArgs& args) {
     int perf_count;
@@ -438,6 +443,7 @@ struct algorithm_search<miopenConvBwdDataAlgorithm_t> {
   static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdDataAlgoGEMM;
 
   static BenchmarkCache<algo_t>& cache() { return bwd_data_algos; }
+  static BenchmarkCache<size_t>& wsscache() { return bwd_data_wssizes; }
 
   static perf_t findAlgorithm(const ConvolutionArgs& args) {
     int perf_count;
@@ -467,6 +473,7 @@ struct algorithm_search<miopenConvBwdWeightsAlgorithm_t> {
   static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdWeightsAlgoGEMM;
 
   static BenchmarkCache<algo_t>& cache() { return bwd_filter_algos; }
+  static BenchmarkCache<size_t>& wsscache() { return bwd_filter_wssizes; }
 
   static perf_t findAlgorithm(const ConvolutionArgs& args) {
     int perf_count;
@@ -493,6 +500,7 @@ template<typename algo_t>
 void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
   using search = algorithm_search<algo_t>;
   auto& cache = search::cache();
+  auto& wsscache = search::wsscache();
 
   if (cache.find(args.params, algo)) {
     return;
@@ -512,6 +520,7 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) {
   *algo = reinterpret_cast<algo_t&>(perfResults);
 
   cache.insert(args.params, *algo);
+  wsscache.insert(args.params, perfResults.memory);
 
   THCCachingAllocator_emptyCache();
 }
@@ -526,7 +535,7 @@ Workspace chooseAlgorithm(
 
   using search = algorithm_search<algo_t>;
   size_t workspace_size;
-  workspace_size = getWorkspaceSize(args, *algo);
+  search::wsscache().find(args.params, &workspace_size);
   try {
     return Workspace(workspace_size);
   } catch (const std::exception& e) {
@@ -535,9 +544,9 @@ Workspace chooseAlgorithm(
 
     // switch to default algorithm and record it in the cache to prevent
     // further OOM errors
     *algo = search::DEFAULT_ALGO;
-    search::cache().insert(args.params, *algo);
-    workspace_size = getWorkspaceSize(args, *algo);
+    search::cache().insert(args.params, *algo);
+    search::wsscache().insert(args.params, workspace_size);
     return Workspace(workspace_size);
   }
 }
-- 
2.7.4
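
Editor's note: the following is a minimal, standalone C++ sketch of the caching pattern the patch describes: key both the selected algorithm and its workspace size on the convolution configuration, so the expensive workspace query (which forces a read of MIOpen's performance database) runs only on a cache miss. The names ConvKey, SimpleCache, queryWorkspaceSizeExpensive, and workspaceSizeFor are illustrative stand-ins, not the actual PyTorch/MIOpen identifiers; the real code uses BenchmarkCache, ConvolutionParams, and the miopenConvolution* APIs.

// Minimal sketch of the workspace-size caching idea (not the actual PyTorch code).
#include <cstddef>
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for ConvolutionParams: anything hashable that
// uniquely identifies a convolution configuration.
using ConvKey = std::string;

// Hypothetical stand-in for the expensive call whose cost the patch avoids
// repeating (in MIOpen it triggers a full read of the performance database).
size_t queryWorkspaceSizeExpensive(const ConvKey& key) {
  std::cout << "expensive workspace query for " << key << "\n";
  return 4096;  // pretend the library reported 4 KiB
}

// Simplified analogue of BenchmarkCache<T>: a mutex-guarded map keyed on the
// convolution configuration.
template <typename T>
class SimpleCache {
 public:
  bool find(const ConvKey& key, T* out) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = map_.find(key);
    if (it == map_.end()) return false;
    *out = it->second;
    return true;
  }
  void insert(const ConvKey& key, const T& value) {
    std::lock_guard<std::mutex> guard(mutex_);
    map_[key] = value;
  }
 private:
  std::mutex mutex_;
  std::unordered_map<ConvKey, T> map_;
};

SimpleCache<int> algo_cache;       // analogue of fwd_algos
SimpleCache<size_t> wssize_cache;  // analogue of fwd_wssizes

// On a miss, run the expensive query once and remember both results; on a hit,
// neither the algorithm search nor the workspace query runs again.
size_t workspaceSizeFor(const ConvKey& key) {
  size_t ws = 0;
  if (wssize_cache.find(key, &ws)) return ws;  // cached: cheap path
  int algo = 0;                                // pretend algorithm selection happened here
  ws = queryWorkspaceSizeExpensive(key);
  algo_cache.insert(key, algo);
  wssize_cache.insert(key, ws);
  return ws;
}

int main() {
  workspaceSizeFor("N=32,C=64,H=56,W=56,K=128,R=3,S=3");  // prints the expensive-query line
  workspaceSizeFor("N=32,C=64,H=56,W=56,K=128,R=3,S=3");  // served from the cache, no query
}

Keeping the workspace size in its own cache, keyed by the same parameters as the algorithm cache, mirrors the patch's fwd_wssizes / bwd_data_wssizes / bwd_filter_wssizes additions: after the first lookup for a configuration, obtaining the workspace size is a plain hash-map hit rather than a database read.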