[XLA] Clean up our handling of ExecutionProfile and add a test case
author     Sanjoy Das <sanjoy@google.com>
    Wed, 10 Jan 2018 22:28:01 +0000 (14:28 -0800)
committer  TensorFlower Gardener <gardener@tensorflow.org>
    Wed, 10 Jan 2018 22:32:14 +0000 (14:32 -0800)
As far as I can tell, ExecutionProfile::compute_cycle_count never worked for
the CPU and GPU backends when HLO profiling was disabled.
PiperOrigin-RevId: 181517824

tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
tensorflow/compiler/xla/service/cpu/cpu_executable.cc
tensorflow/compiler/xla/service/executable.cc
tensorflow/compiler/xla/service/gpu/gpu_executable.cc
tensorflow/compiler/xla/service/hlo_execution_profile.h
tensorflow/compiler/xla/tests/BUILD
tensorflow/compiler/xla/tests/execution_profile_test.cc [new file with mode: 0644]

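For context, the profile in question is requested through the XLA client API; below is a minimal caller-side sketch (mirroring the new test at the bottom of this change; client, computation, input, and execution_options are assumed to be set up as in ClientLibraryTestBase, and error handling is elided):

    // Hedged sketch, not code from this change: how a client obtains an
    // ExecutionProfile for a single execution.
    ExecutionProfile profile;
    TF_ASSERT_OK_AND_ASSIGN(
        std::unique_ptr<GlobalData> result,
        client->Execute(computation, {input.get()}, &execution_options,
                        &profile));
    // compute_time_ns() is always populated; after this change,
    // compute_cycle_count() is only guaranteed under --xla_hlo_profile.
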
index 4ab61e616edf57b506d51ae1c309bbe18bfff0b6..9636f6b5b334233f96c408ccc9779b8f34ad9220 100644
@@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
 
   HloComputation* entry_computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
   std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
   if (module->config().hlo_profiling_enabled()) {
@@ -506,6 +507,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
     hlo_profile_printer =
         CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+    computation_to_profile_idx =
+        hlo_profile_index_map->computation_to_profile_idx();
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -517,18 +520,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   const string xla_dump_hlo_proto_to =
       module->config().debug_options().xla_dump_hlo_proto_to();
 
-  // We always profile the entry computation as a whole, even if hlo profiling
-  // is disabled.  When hlo profiling is diabled, the executor passes in a
-  // profile counter array of just one element, which corresponds to the whole
-  // computation.
-  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
-  if (hlo_profile_index_map) {
-    computation_to_profile_idx =
-        hlo_profile_index_map->computation_to_profile_idx();
-  } else {
-    computation_to_profile_idx[entry_computation] = 0;
-  }
-
   if (options::CpuParallelBackendRequested(module->config())) {
     VLOG(1) << "Using parallel cpu backend";
 
index 028f827337979de14ec557a8f0d7a47f095bf55e..f335bd1bbc7376d1cccc0fa6aa1c0a6d6ad559ab 100644
@@ -147,17 +147,13 @@ Status CpuExecutable::ExecuteComputeFunction(
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Even when not Hlo profiling, we allocate a counter for the entire
-  // computation, which we use to update ExecutionProfile below.
-  std::vector<int64>* profile_counters = nullptr;
-  std::vector<int64> profile_counter_for_entry_computation;
-  if (hlo_execution_profile) {
-    profile_counters = hlo_execution_profile->mutable_profile_counters();
-  } else {
-    profile_counters = &profile_counter_for_entry_computation;
-    profile_counter_for_entry_computation.push_back(0);
-  }
+  size_t profile_counters_size =
+      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
+                            : 0;
+  int64* profile_counters =
+      hlo_execution_profile
+          ? hlo_execution_profile->mutable_profile_counters()->data()
+          : nullptr;
 
   // Call the computation function following the calling convention.
   std::vector<void*> buffer_pointers;
@@ -172,7 +168,7 @@ Status CpuExecutable::ExecuteComputeFunction(
     VLOG(3) << tensorflow::strings::Printf(
         "  func(void* result, void* params[%zu], void* temps[%zu], "
         "uint64 profile_counters[%zu])",
-        args_array.size(), buffer_pointers.size(), profile_counters->size());
+        args_array.size(), buffer_pointers.size(), profile_counters_size);
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
       tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -184,11 +180,11 @@ Status CpuExecutable::ExecuteComputeFunction(
         "    temps = [%s]",
         tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
-                                           profile_counters->data());
+                                           profile_counters);
   }
 
   compute_function_(result_buffer, run_options, args_array.data(),
-                    buffer_pointers.data(), profile_counters->data());
+                    buffer_pointers.data(), profile_counters);
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -196,13 +192,11 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::mutex_lock lock(mutex_);
     const double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-
+    // If hlo profiling was disabled then the cycle count is left empty.
     if (hlo_execution_profile) {
       execution_profile_.set_compute_cycle_count(
           hlo_execution_profile->total_cycles_executed(
               *module().entry_computation()));
-    } else {
-      execution_profile_.set_compute_cycle_count(profile_counters->back());
     }
   }
 
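With this hunk the JITed compute function receives a null counter array when HLO profiling is off (note the VLOG above still prints the parameter as uint64 while the counters are stored as int64). An approximate sketch of the entry point's signature, with illustrative parameter names rather than the exact ComputeFunctionType typedef:

    // Approximation only: generated code must leave profile_counters alone
    // when it is nullptr (i.e. when profile_counters_size is 0).
    using ComputeFunctionType =
        void (*)(void* result, const void* run_options, const void** args,
                 void** temps, int64* profile_counters);
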
index c9a6ad5edbd35041f180e06c7ae0229b2b00b95a..21e7fbea291721dfc446bae2a7002a8ec2520be4 100644
@@ -87,6 +87,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
     VLOG(1) << "done with block-host-until-done";
 
     // Merge in run-time profile information from execution_profile.
+    //
+    // TODO(b/71713097): This is buggy -- even though the mutex takes care of
+    // C++ level races, some other concurrent ExecuteOnStreamWrapper call could
+    // have rewritten the execution_profile before we get to it.
     profile->MergeFrom(execution_profile());
 
     // Overall execution time (in nanoseconds) from the executor timer.
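
A hypothetical interleaving that triggers the race described in the TODO above:

    // Threads A and B share one Executable, and therefore one
    // execution_profile_ guarded by mutex_.
    //   A: finishes its run, writes execution_profile_ (holding mutex_).
    //   B: finishes its run, overwrites execution_profile_ (holding mutex_).
    //   A: profile->MergeFrom(execution_profile());  // observes B's timings.
    // Every individual access is mutex-protected, yet A reports B's profile.
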
index 5b019e5289b8cb86ece1c728b506ca81e2fb3555..51d164cdf427f9513bc340e090832a9b064b999c 100644
@@ -66,7 +66,9 @@ class HloExecutionProfiler {
 
   // If profiling is enabled, sets the total cycle count on the profile from the
   // execution timer.
-  ~HloExecutionProfiler() {
+  void FinishExecution() {
+    CHECK(!finished_execution_) << "Call FinishExecution only once!";
+    finished_execution_ = true;
     if (do_profile_) {
       stream_->ThenStopTimer(execution_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
@@ -101,6 +103,7 @@ class HloExecutionProfiler {
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
+  bool finished_execution_ = false;
 };
 
 }  // namespace
@@ -143,9 +146,12 @@ Status GpuExecutable::ExecuteThunks(
   if (do_profile) {
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
+
   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                 hlo_module_->entry_computation());
 
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
@@ -222,6 +228,22 @@ Status GpuExecutable::ExecuteThunks(
     }
   }
 
+  profiler.FinishExecution();
+  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    const double nanoseconds = (end_micros - start_micros) * 1000.0;
+    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
+
+    // If hlo profiling was disabled then the cycle count is left empty.
+    if (do_profile) {
+      execution_profile_.set_compute_cycle_count(
+          hlo_execution_profile->total_cycles_executed(
+              *module().entry_computation()));
+    }
+  }
+
   return Status::OK();
 }
 
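Replacing the profiler's destructor-driven timer stop with an explicit FinishExecution() lets ExecuteThunks stop the timer and then read total_cycles_executed before the profiler is destroyed. A condensed sketch of the resulting control flow (simplified; names follow the hunks above):

    HloExecutionProfiler profiler(do_profile, hlo_execution_profile,
                                  main_stream, entry_computation);
    uint64 start_micros = tensorflow::Env::Default()->NowMicros();
    // ... enqueue and wait on all thunks ...
    profiler.FinishExecution();  // CHECK-fails if called a second time.
    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
    // compute_time_ns is always recorded; compute_cycle_count only when
    // do_profile is true.
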
index 470fd4ce3c205d84152238f4b18daad77e403f68..1a6b069609cb58bcc9659b4457453758a277bc0e 100644
@@ -125,6 +125,9 @@ class HloExecutionProfile {
   }
 
   std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
+  const std::vector<int64>& profile_counters() const {
+    return profile_counters_;
+  }
 
  private:
   const HloProfilePrinter& hlo_profile_printer_;
index 9ae4526d78c086c15ee4311761b318672b9e3a53..1a66ec3ce3c617f709ce01d469519e9651ceff34 100644
@@ -1405,6 +1405,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "execution_profile_test",
+    srcs = ["execution_profile_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+xla_test(
+    name = "execution_profile_test_with_xla_hlo_profile",
+    srcs = ["execution_profile_test.cc"],
+    args = ["--xla_hlo_profile"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
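
Building the same source twice, the second time with args = ["--xla_hlo_profile"], exercises both branches of the new test's expectations: the profiled variant must observe a positive compute_cycle_count, while the unprofiled variant only checks the wall-clock fields.
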
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
new file mode 100644
index 0000000..644cbbf
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class ExecutionProfileTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ExecutionProfileTest,
+           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+  Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> input,
+      client_->TransferToServer(
+          *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
+
+  ComputationBuilder b(client_, TestName() + ".add");
+  b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
+  TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+
+  ExecutionProfile execution_profile;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> data,
+      client_->Execute(dot_product, {input.get(), input.get()},
+                       &execution_options_, &execution_profile));
+
+  VLOG(3) << "execution_profile.compute_cycle_count() = "
+          << execution_profile.compute_cycle_count();
+  VLOG(3) << "execution_profile.compute_and_transfer_time_ns() = "
+          << execution_profile.compute_and_transfer_time_ns();
+  VLOG(3) << "execution_profile.compute_time_ns() = "
+          << execution_profile.compute_time_ns();
+
+  bool hlo_profiling_enabled =
+      execution_options_.debug_options().xla_hlo_profile();
+
+  // If HLO profiling is enabled we always expect cycle count to be populated.
+  // If HLO profiling is disabled then depending on the backend the cycle count
+  // may or may not be populated.
+  if (hlo_profiling_enabled) {
+    EXPECT_GT(execution_profile.compute_cycle_count(), 0);
+  }
+
+  EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0);
+  EXPECT_GT(execution_profile.compute_time_ns(), 0);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape));
+  (void)computed;
+}
+
+}  // namespace
+}  // namespace xla