[TF:XLA] Delete cumulative_total_size to simplify the DFS scheduler.
author Dimitris Vardoulakis <dimvar@google.com>
Mon, 21 May 2018 21:25:04 +0000 (14:25 -0700)
committer TensorFlower Gardener <gardener@tensorflow.org>
Mon, 21 May 2018 21:27:45 +0000 (14:27 -0700)
It's unclear why we would cap total_sizes[hlo] at cumulative_total_size, effectively assigning the cumulative total as the size of a single HLO; deleting the cap makes no difference in practice.

PiperOrigin-RevId: 197458260

tensorflow/compiler/xla/service/hlo_scheduling.cc

index 51c29d4..854aa94 100644
@@ -437,7 +437,6 @@ StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
   // simply users-1 for each instruction.  By subtracting 1, we're saying that
   // instructions with no users or a single user don't count; instructions with
   // lots of fan-out will be visited earlier.
-  int64 cumulative_total_size = 0;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> extra_users;
   tensorflow::gtl::FlatMap<const HloInstruction*, int64> total_sizes;
   for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
@@ -450,14 +449,12 @@ StatusOr<std::vector<const HloInstruction*>> DFSMemoryScheduler(
     int64 logical_buffer_size = SumLogicalBufferSizes(
         points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function);
     total_sizes[hlo] = logical_buffer_size;
-    cumulative_total_size += logical_buffer_size;
     tensorflow::gtl::FlatSet<const HloInstruction*> unique_operands(
         hlo->operands().begin(), hlo->operands().end());
     for (const HloInstruction* operand : unique_operands) {
       extra_users[hlo] += extra_users[operand];
       total_sizes[hlo] += total_sizes[operand];
     }
-    total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
   }
   CHECK_EQ(extra_users.size(), computation.instruction_count());
   CHECK_EQ(total_sizes.size(), computation.instruction_count());
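
The surviving loop is easier to read outside diff context. Below is a minimal standalone sketch, not XLA code, of the priority bookkeeping after this change: Node, ComputePriorities, and std::unordered_map are hypothetical stand-ins for HloInstruction, the in-scheduler loop, and tensorflow::gtl::FlatMap. Note that total_sizes[hlo] can count the same operand subtree once per path through the DAG; the deleted std::min line capped that sum at cumulative_total_size, and per the commit message the cap made no difference in practice.

// Minimal standalone sketch (not XLA code) of the priority computation that
// remains in DFSMemoryScheduler after this change. `Node`, `ComputePriorities`,
// and std::unordered_map are hypothetical stand-ins for HloInstruction, the
// in-scheduler loop, and tensorflow::gtl::FlatMap.
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  int64_t buffer_size = 0;            // stand-in for SumLogicalBufferSizes(...)
  std::vector<const Node*> operands;  // stand-in for hlo->operands()
  std::vector<const Node*> users;     // stand-in for hlo->users()
};

// `post_order` lists every node after all of its operands, mirroring
// computation.MakeInstructionPostOrder().
void ComputePriorities(
    const std::vector<const Node*>& post_order,
    std::unordered_map<const Node*, int64_t>* extra_users,
    std::unordered_map<const Node*, int64_t>* total_sizes) {
  for (const Node* node : post_order) {
    // users - 1: nodes with zero or one user contribute nothing, so
    // high-fan-out nodes get visited earlier.
    (*extra_users)[node] =
        node->users.empty() ? 0 : static_cast<int64_t>(node->users.size()) - 1;
    (*total_sizes)[node] = node->buffer_size;
    // Deduplicate operands so an operand used twice is counted once.
    std::unordered_set<const Node*> unique_operands(node->operands.begin(),
                                                    node->operands.end());
    for (const Node* operand : unique_operands) {
      (*extra_users)[node] += (*extra_users)[operand];
      (*total_sizes)[node] += (*total_sizes)[operand];
    }
    // The deleted lines clamped (*total_sizes)[node] here via
    // std::min(total_sizes[hlo], cumulative_total_size).
  }
}
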
@@ -512,13 +509,14 @@ StatusOr<std::vector<const HloInstruction*>> DefaultMemoryScheduler(
     const LogicalBuffer::SizeFunction& size_function,
     const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
         memory_by_computation) {
-  // We try both a list-scheduler based ordering and a DFS based ordering, and
-  // choose whichever returns a lower min-memory, not accounting for
-  // fragmentation.
-  //
-  // Note that this is just a heuristic. One obvious inaccuracy is that the
-  // memory required for sub-computations might be different when considered
-  // within the caller's context. But it's good enough for now.
+  // We try a few schedulers and choose whichever returns a lower min-memory,
+  // not accounting for fragmentation.
+  // - List is a scheduler that uses greedy heuristics.
+  // - DFS visits HLOs in postorder, with a heuristic to decide the order of
+  //   children.
+  // - Postorder does not use any heuristics.
+  // List wins for most of our benchmarks; postorder-based schedulers win for
+  // some RNNs.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
       ListMemoryScheduler(computation, points_to_analysis, size_function,
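
The rewritten comment above describes a pick-the-cheapest pattern: run each candidate scheduler, measure the min-memory of the resulting sequence (ignoring fragmentation), and keep the best. A minimal sketch under assumed names follows; Schedule, Scheduler, PickBestSchedule, and the peak_memory callback are hypothetical stand-ins for std::vector<const HloInstruction*>, the *MemoryScheduler entry points, and the min-memory computation.

// Minimal sketch (not XLA code) of "try several schedulers, keep whichever
// yields the lowest peak memory". All names here are hypothetical.
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>

using Schedule = std::vector<int>;  // stand-in for a sequence of HLOs
using Scheduler = std::function<Schedule()>;

Schedule PickBestSchedule(
    const std::vector<Scheduler>& schedulers,
    const std::function<int64_t(const Schedule&)>& peak_memory) {
  Schedule best;
  int64_t best_memory = std::numeric_limits<int64_t>::max();
  for (const Scheduler& scheduler : schedulers) {
    Schedule candidate = scheduler();
    const int64_t memory = peak_memory(candidate);  // ignores fragmentation
    if (memory < best_memory) {
      best_memory = memory;
      best = std::move(candidate);
    }
  }
  return best;
}
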