HloComputation* entry_computation = module->entry_computation();
std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+ std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
if (module->config().hlo_profiling_enabled()) {
TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
hlo_profile_printer =
CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+ computation_to_profile_idx =
+ hlo_profile_index_map->computation_to_profile_idx();
}
std::unique_ptr<Executable> cpu_executable;
const string xla_dump_hlo_proto_to =
module->config().debug_options().xla_dump_hlo_proto_to();
- // We always profile the entry computation as a whole, even if hlo profiling
- // is disabled. When hlo profiling is diabled, the executor passes in a
- // profile counter array of just one element, which corresponds to the whole
- // computation.
- std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
- if (hlo_profile_index_map) {
- computation_to_profile_idx =
- hlo_profile_index_map->computation_to_profile_idx();
- } else {
- computation_to_profile_idx[entry_computation] = 0;
- }
-
if (options::CpuParallelBackendRequested(module->config())) {
VLOG(1) << "Using parallel cpu backend";
uint64 start_micros = tensorflow::Env::Default()->NowMicros();
- // Allocate profiling counters for each hlo instruction that we would like to
- // profile. Even when not Hlo profiling, we allocate a counter for the entire
- // computation, which we use to update ExecutionProfile below.
- std::vector<int64>* profile_counters = nullptr;
- std::vector<int64> profile_counter_for_entry_computation;
- if (hlo_execution_profile) {
- profile_counters = hlo_execution_profile->mutable_profile_counters();
- } else {
- profile_counters = &profile_counter_for_entry_computation;
- profile_counter_for_entry_computation.push_back(0);
- }
+ size_t profile_counters_size =
+ hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
+ : 0;
+ int64* profile_counters =
+ hlo_execution_profile
+ ? hlo_execution_profile->mutable_profile_counters()->data()
+ : nullptr;
// Call the computation function following the calling convention.
std::vector<void*> buffer_pointers;
VLOG(3) << tensorflow::strings::Printf(
" func(void* result, void* params[%zu], void* temps[%zu], "
"uint64 profile_counters[%zu])",
- args_array.size(), buffer_pointers.size(), profile_counters->size());
+ args_array.size(), buffer_pointers.size(), profile_counters_size);
VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer);
auto ptr_printer = [](string* out, const void* p) {
tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
" temps = [%s]",
tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p",
- profile_counters->data());
+ profile_counters);
}
compute_function_(result_buffer, run_options, args_array.data(),
- buffer_pointers.data(), profile_counters->data());
+ buffer_pointers.data(), profile_counters);
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
tensorflow::mutex_lock lock(mutex_);
const double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-
+ // If hlo profiling was disabled then the cycle count is left empty.
if (hlo_execution_profile) {
execution_profile_.set_compute_cycle_count(
hlo_execution_profile->total_cycles_executed(
*module().entry_computation()));
- } else {
- execution_profile_.set_compute_cycle_count(profile_counters->back());
}
}
VLOG(1) << "done with block-host-until-done";
// Merge in run-time profile information from execution_profile.
+ //
+ // TODO(b/71713097): This is buggy -- even though the mutex takes care of
+ // C++ level races, some other concurrent ExecuteOnStreamWrapper call could
+ // have rewritten the execution_profile before we get to it.
profile->MergeFrom(execution_profile());
// Overall execution time (in nanoseconds) from the executor timer.
// If profiling is enabled, sets the total cycle count on the profile from the
// execution timer.
- ~HloExecutionProfiler() {
+ void FinishExecution() {
+ CHECK(!finished_execution_) << "Call FinishExecution only once!";
+ finished_execution_ = true;
if (do_profile_) {
stream_->ThenStopTimer(execution_timer_.get());
stream_->BlockHostUntilDone().IgnoreError();
const HloComputation* computation_;
std::unique_ptr<se::Timer> execution_timer_;
std::unique_ptr<se::Timer> per_op_timer_;
+ bool finished_execution_ = false;
};
} // namespace
if (do_profile) {
LOG(WARNING) << "PROFILING: profiling is enabled";
}
+
HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
hlo_module_->entry_computation());
+ uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
// Stream 0 indicates `main_stream` and substreams start from stream 1.
std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
}
}
+ profiler.FinishExecution();
+ uint64 end_micros = tensorflow::Env::Default()->NowMicros();
+
+ {
+ tensorflow::mutex_lock lock(mutex_);
+ const double nanoseconds = (end_micros - start_micros) * 1000.0;
+ execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
+
+ // If hlo profiling was disabled then the cycle count is left empty.
+ if (do_profile) {
+ execution_profile_.set_compute_cycle_count(
+ hlo_execution_profile->total_cycles_executed(
+ *module().entry_computation()));
+ }
+ }
+
return Status::OK();
}
}
std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
+ const std::vector<int64>& profile_counters() const {  // Read-only counterpart of mutable_profile_counters().
+ return profile_counters_;  // Returned by const reference; no copy of the vector is made.
+ }
private:
const HloProfilePrinter& hlo_profile_printer_;
],
)
+xla_test(
+ name = "execution_profile_test",  # Runs execution_profile_test.cc with no extra command-line args.
+ srcs = ["execution_profile_test.cc"],
+ deps = [
+ ":client_library_test_base",
+ "//tensorflow/compiler/xla/client:computation_builder",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:test",
+ ],
+)
+
+xla_test(
+ name = "execution_profile_test_with_xla_hlo_profile",
+ srcs = ["execution_profile_test.cc"],  # Same source file as the execution_profile_test target.
+ args = ["--xla_hlo_profile"],  # The only difference from execution_profile_test: passes the HLO profiling flag.
+ deps = [
+ ":client_library_test_base",
+ "//tensorflow/compiler/xla/client:computation_builder",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:test",
+ ],
+)
+
xla_test(
name = "replay_test",
srcs = ["replay_test.cc"],
--- /dev/null
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class ExecutionProfileTest : public ClientLibraryTestBase {};  // Fixture for the test below; adds no members of its own to ClientLibraryTestBase.
+
+// Executes a 256x256 F32 dot product through client_->Execute() with an
+// ExecutionProfile attached, then checks that the profile's timing fields were
+// populated. The cycle count is only asserted when the xla_hlo_profile debug
+// option is set.
+XLA_TEST_F(ExecutionProfileTest,
+           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+  Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
+
+  // A single server-side buffer is used for both operands of the dot below.
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> input,
+      client_->TransferToServer(
+          *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
+
+  // The computation is named after the op it actually builds (a Dot).
+  ComputationBuilder b(client_, TestName() + ".dot");
+  b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
+  TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+
+  ExecutionProfile execution_profile;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> data,
+      client_->Execute(dot_product, {input.get(), input.get()},
+                       &execution_options_, &execution_profile));
+
+  // Log the raw profile values to make failures easier to diagnose.
+  VLOG(3) << "execution_profile.compute_cycle_count() = "
+          << execution_profile.compute_cycle_count();
+  VLOG(3) << "execution_profile.compute_and_transfer_time_ns() = "
+          << execution_profile.compute_and_transfer_time_ns();
+  VLOG(3) << "execution_profile.compute_time_ns() = "
+          << execution_profile.compute_time_ns();
+
+  bool hlo_profiling_enabled =
+      execution_options_.debug_options().xla_hlo_profile();
+
+  // If HLO profiling is enabled we always expect cycle count to be populated.
+  // If HLO profiling is disabled then depending on the backend the cycle count
+  // may or may not be populated.
+  if (hlo_profiling_enabled) {
+    EXPECT_GT(execution_profile.compute_cycle_count(), 0);
+  }
+
+  // The timing fields are expected to be set regardless of HLO profiling.
+  EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0);
+  EXPECT_GT(execution_profile.compute_time_ns(), 0);
+
+  // Transfer the result back; only the success of the transfer is checked and
+  // the returned literal itself is deliberately unused.
+  TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape));
+  (void)computed;
+}
+
+} // namespace
+} // namespace xla