From 2cc97784950739a0a71abad59ee263e7583ea080 Mon Sep 17 00:00:00 2001
From: Raghavan Raman
Date: Fri, 10 Sep 2021 12:35:24 -0700
Subject: [PATCH] [MicroBench] Added a log_vml version of the signed log1p
 kernel (#64205)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64205

The log_vml version of the micro-bench is over **2x** faster than the
log1p version. Here are the perf numbers:

```
---------------------------------------------------------------------------------------------
Benchmark                                    Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------------------------------
SignedLog1pBench/ATen/10/1467            45915 ns        45908 ns        14506 GB/s=2.5564G/s
SignedLog1pBench/NNC/10/1467             40469 ns        40466 ns        17367 GB/s=2.9002G/s
SignedLog1pBench/NNCLogVml/10/1467       19560 ns        19559 ns        35902 GB/s=6.00016G/s
```

Thanks to bertmaher for pointing this out.
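For context: both NNC versions compute sign(x) * log1p(|x|), and only the
log differs. The NNCLogVml variant rewrites log1p(|x|) as log(|x| + 1) so
the log can be evaluated via log_vml instead of log1p, as in the diff
below. A minimal scalar sketch of that rewrite, in plain C++ and
independent of NNC (the function names here are illustrative, not part of
this patch):

```
#include <cmath>
#include <cstdio>

// Scalar reference for the benchmarked kernel:
// signed_log1p(x) = sign(x) * log1p(|x|).
// (The exact sign convention at x == 0 does not matter here, since
// log1p(|0|) == 0 makes the product 0 either way.)
float signed_log1p_ref(float x) {
  float sign = x < 0.0f ? -1.0f : 1.0f;
  return sign * std::log1p(std::fabs(x));
}

// The NNCLogVml variant relies on log1p(|x|) == log(|x| + 1), so the
// log can go through a vectorized log routine.
float signed_log1p_vml_style(float x) {
  float sign = x < 0.0f ? -1.0f : 1.0f;
  return sign * std::log(std::fabs(x) + 1.0f);
}

int main() {
  // log(|x| + 1) is less accurate than log1p(|x|) for tiny |x|, which
  // is presumably why the patch loosens the allclose tolerances to 1e-3.
  for (float x : {-2.5f, -1e-6f, 0.0f, 3.0f}) {
    std::printf("x=%g ref=%g vml=%g\n",
                x, signed_log1p_ref(x), signed_log1p_vml_style(x));
  }
  return 0;
}
```

The speedup comes entirely from the log: the abs, sign, and mul stages are
identical in both NNC kernels.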
Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30644716

Pulled By: navahgar

fbshipit-source-id: ba2b32c79d4265cd48a2886b0c62d0e89ff69c19
---
 benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp | 76 +++++++++++++++++++-----
 1 file changed, 61 insertions(+), 15 deletions(-)

diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
index 44781f5..155b408 100644
--- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -22,7 +23,7 @@ class SignedLog1pBench : public benchmark::Fixture {
   }
 
   void TearDown(benchmark::State& state) override {
-    TORCH_CHECK(at::allclose(ref_, output_));
+    TORCH_CHECK(at::allclose(ref_, output_, 1e-3, 1e-3));
     state.counters["GB/s"] = benchmark::Counter(
         uint64_t(state.iterations()) * 2 * output_.nbytes(),
         benchmark::Counter::kIsRate);
@@ -55,24 +56,63 @@ class SignedLog1pBench : public benchmark::Fixture {
         [&](const VarHandle& m, const VarHandle& n) {
           return log1p(abs_result.load(m, n));
         });
-    Tensor sign = Compute(
-        "aten_sign",
+    Tensor sign_result = computeSign(
+        {input_ph.handle()}, {input_size_int_[0], input_size_int_[1]});
+    Tensor output = Compute(
+        "aten_mul",
         {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
         [&](const VarHandle& m, const VarHandle& n) {
-          return CompareSelect::make(
-              input_ph.load(m, n),
-              ExprHandle(0.0f),
-              ExprHandle(-1),
-              ExprHandle(1),
-              kLT);
+          return sign_result.load(m, n) * log1p_result.load(m, n);
         });
+    LoopNest nest({output}, {abs_result, log1p_result, sign_result, output});
+    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
+    nest.inlineIntermediateBufs(true);
+    nest.prepareForCodegen();
+    nest.simplify();
+    nest.vectorizeInnerLoops();
+    nest.simplify();
+    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
+
+    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
+    std::vector<CodeGen::BufferArg> buf_args;
+    buf_args.emplace_back(input_ph);
+    buf_args.emplace_back(output);
+    LLVMCodeGen cg(nest.root_stmt(), buf_args);
+
+    std::vector<CodeGen::CallArg> call_args;
+    for (auto _ : state) {
+      output_ = at::empty_like(ref_);
+      call_args.clear();
+      call_args.emplace_back(input_.data_ptr());
+      call_args.emplace_back(output_.data_ptr());
+      cg.call(call_args);
+    }
+  }
+
+  void runNNCLogVml(benchmark::State& state) {
+    Placeholder input_ph(
+        "input", kFloat, {input_size_int_[0], input_size_int_[1]});
+    Tensor abs_result = Compute(
+        "aten_abs",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return abs(input_ph.load(m, n));
+        });
+    Tensor log_vml_result = Compute(
+        "aten_log1p",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return log_vml(abs_result.load(m, n) + ExprHandle(1));
+        });
+    Tensor sign_result = computeSign(
+        {input_ph.handle()}, {input_size_int_[0], input_size_int_[1]});
     Tensor output = Compute(
         "aten_mul",
         {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
         [&](const VarHandle& m, const VarHandle& n) {
-          return sign.load(m, n) * log1p_result.load(m, n);
+          return sign_result.load(m, n) * log_vml_result.load(m, n);
         });
-    LoopNest nest({output}, {abs_result, log1p_result, sign, output});
+    LoopNest nest({output}, {abs_result, log_vml_result, sign_result, output});
     GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
     nest.inlineIntermediateBufs(true);
     nest.prepareForCodegen();
@@ -83,16 +123,16 @@ class SignedLog1pBench : public benchmark::Fixture {
 
     // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
     std::vector<CodeGen::BufferArg> buf_args;
-    buf_args.push_back(input_ph);
-    buf_args.push_back(output);
+    buf_args.emplace_back(input_ph);
+    buf_args.emplace_back(output);
     LLVMCodeGen cg(nest.root_stmt(), buf_args);
 
     std::vector<CodeGen::CallArg> call_args;
     for (auto _ : state) {
       output_ = at::empty_like(ref_);
       call_args.clear();
-      call_args.push_back(input_.data_ptr());
-      call_args.push_back(output_.data_ptr());
+      call_args.emplace_back(input_.data_ptr());
+      call_args.emplace_back(output_.data_ptr());
       cg.call(call_args);
     }
   }
@@ -115,6 +155,12 @@
 BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
   runNNC(state);
 }
 
+BENCHMARK_DEFINE_F(SignedLog1pBench, NNCLogVml)(benchmark::State& state) {
+  runNNCLogVml(state);
+}
+
 BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
 BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
+
+BENCHMARK_REGISTER_F(SignedLog1pBench, NNCLogVml)->Args({10, 1467});
-- 
2.7.4
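A note on the harness, for readers new to it: BENCHMARK_DEFINE_F and
BENCHMARK_REGISTER_F are Google Benchmark's fixture macros, and
Args({10, 1467}) is what produces the /10/1467 suffix in the benchmark
names above; the values come back through state.range(0) and
state.range(1). A stripped-down sketch of that pattern, with a
hypothetical fixture standing in for SignedLog1pBench:

```
#include <benchmark/benchmark.h>

#include <vector>

// Hypothetical fixture following the same pattern as SignedLog1pBench:
// SetUp reads the sizes passed via Args({...}) at registration time.
class ExampleFixture : public benchmark::Fixture {
 public:
  void SetUp(benchmark::State& state) override {
    rows_ = state.range(0);
    cols_ = state.range(1);
    data_.assign(static_cast<size_t>(rows_ * cols_), 1.0f);
  }

 protected:
  int64_t rows_ = 0;
  int64_t cols_ = 0;
  std::vector<float> data_;
};

// Define the benchmark body against the fixture...
BENCHMARK_DEFINE_F(ExampleFixture, Copy)(benchmark::State& state) {
  for (auto _ : state) {
    std::vector<float> out = data_;        // stand-in for the kernel call
    benchmark::DoNotOptimize(out.data());  // keep the copy observable
  }
}

// ...and register it with the sizes the patch uses, which yields a
// benchmark named ExampleFixture/Copy/10/1467.
BENCHMARK_REGISTER_F(ExampleFixture, Copy)->Args({10, 1467});

BENCHMARK_MAIN();
```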