[XLA:GPU] Fix HLO profiling when multiple streams are involved.
author    Justin Lebar <jlebar@google.com>
Mon, 26 Feb 2018 17:24:38 +0000 (09:24 -0800)
committer TensorFlower Gardener <gardener@tensorflow.org>
Mon, 26 Feb 2018 17:29:01 +0000 (09:29 -0800)
We were enqueueing the timer on the main stream, but not blocking the
substreams, so the results were nonsensical.

PiperOrigin-RevId: 187032412

tensorflow/compiler/xla/service/gpu/gpu_executable.cc

index 623d671..04b37d9 100644 (file)
@@ -46,12 +46,14 @@ namespace {
 class HloExecutionProfiler {
  public:
   // If profiling is enabled, start an execution timer running.
-  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
-                                se::Stream* stream,
-                                const HloComputation* computation)
+  explicit HloExecutionProfiler(
+      bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
+      const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+      const HloComputation* computation)
       : do_profile_(do_profile),
         profile_(profile),
         stream_(stream),
+        sub_streams_(sub_streams),
         computation_(computation) {
     if (do_profile_) {
       clock_rate_ghz_ =
@@ -70,6 +72,7 @@ class HloExecutionProfiler {
     CHECK(!finished_execution_) << "Call FinishExecution only once!";
     finished_execution_ = true;
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(execution_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->set_total_cycles_executed(
@@ -88,6 +91,7 @@ class HloExecutionProfiler {
   // that the hlo_instruction took to execute in the profile.
   void FinishOperation(const HloInstruction* hlo_instruction) {
     if (do_profile_) {
+      stream_->ThenWaitFor(&sub_streams_);
       stream_->ThenStopTimer(per_op_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
       profile_->SetCyclesTakenBy(
@@ -100,6 +104,7 @@ class HloExecutionProfiler {
   double clock_rate_ghz_;
   HloExecutionProfile* profile_;
   se::Stream* stream_;
+  const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
@@ -147,13 +152,9 @@ Status GpuExecutable::ExecuteThunks(
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
 
-  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
-                                hlo_module_->entry_computation());
-
-  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
-
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
+  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
     sub_streams.emplace_back();
     TF_ASSIGN_OR_RETURN(
@@ -161,6 +162,10 @@ Status GpuExecutable::ExecuteThunks(
         run_options->BorrowStream(main_stream->parent()->device_ordinal()));
   }
 
+  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
+                                sub_streams, hlo_module_->entry_computation());
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // The next event enqueued on stream N must not run until the thunk at
   // last_blocking_thunk_for_stream[N] completes.
   std::map<int32, const Thunk*> last_blocking_thunk_for_stream;