From c76dd17b2086b760ac38e1e12ec3d4df6268d0b3 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 26 Feb 2018 09:24:38 -0800
Subject: [PATCH] [XLA:GPU] Fix HLO profiling when multiple streams are
 involved.

We were enqueueing the timer on the main stream, but not blocking the
substreams, so the results were nonsensical.

PiperOrigin-RevId: 187032412
---
 .../compiler/xla/service/gpu/gpu_executable.cc | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 623d671..04b37d9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -46,12 +46,14 @@ namespace {
 class HloExecutionProfiler {
  public:
   // If profiling is enabled, start an execution timer running.
-  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
-                                se::Stream* stream,
-                                const HloComputation* computation)
+  explicit HloExecutionProfiler(
+      bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
+      const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+      const HloComputation* computation)
       : do_profile_(do_profile),
         profile_(profile),
         stream_(stream),
+        sub_streams_(sub_streams),
         computation_(computation) {
     if (do_profile_) {
       clock_rate_ghz_ =
@@ -70,6 +72,7 @@ class HloExecutionProfiler {
     CHECK(!finished_execution_) << "Call FinishExecution only once!";
     finished_execution_ = true;
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(execution_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->set_total_cycles_executed(
@@ -88,6 +91,7 @@ class HloExecutionProfiler {
   // that the hlo_instruction took to execute in the profile.
   void FinishOperation(const HloInstruction* hlo_instruction) {
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(per_op_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->SetCyclesTakenBy(
@@ -100,6 +104,7 @@ class HloExecutionProfiler {
   double clock_rate_ghz_;
   HloExecutionProfile* profile_;
   se::Stream* stream_;
+  const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
@@ -147,13 +152,9 @@ Status GpuExecutable::ExecuteThunks(
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
 
-  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
-                                hlo_module_->entry_computation());
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
+  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
     sub_streams.emplace_back();
     TF_ASSIGN_OR_RETURN(
         sub_streams.back(),
         run_options->BorrowStream(main_stream->parent()->device_ordinal()));
   }
@@ -161,6 +162,10 @@ Status GpuExecutable::ExecuteThunks(
+  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
+                                sub_streams, hlo_module_->entry_computation());
+
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // The next event enqueued on stream N must not run until the thunk at
   // last_blocking_thunk_for_stream[N] completes.
   std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
-- 
2.7.4