From d66adb41874acddfd9e01f46e064965ee39850ca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 26 Apr 2018 11:40:46 -0700 Subject: [PATCH] Simplify, test and document logic in instruction fusion that decides whether we allow fusion when an operation needs to be duplicated. PiperOrigin-RevId: 194429279 --- tensorflow/compiler/xla/service/BUILD | 1 + .../compiler/xla/service/instruction_fusion.cc | 166 ++++++++++----------- .../compiler/xla/service/instruction_fusion.h | 17 ++- .../xla/service/instruction_fusion_test.cc | 156 +++++++++++++++++++ 4 files changed, 249 insertions(+), 91 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index d55da36..f39bfb8 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1206,6 +1206,7 @@ tf_cc_test( ":instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index b9ccfed..dc1a39e 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -128,11 +128,11 @@ namespace xla { return false; } -// An "effectively unary" operation is one that has one "large" +// An "effectively at most unary" operation is one that has at most one "large" // input with the others being negligible in terms of memory usage. // We use "has a smaller true rank than the output" as a heuristic // for "negligible" memory usage. -bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) { +bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) { int64 output_rank = 0; ShapeUtil::ForEachSubshape( hlo->shape(), @@ -156,66 +156,91 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) { } bool InstructionFusion::CanFuseOnAllPaths( - const HloReachabilityMap& reachability_map, HloInstruction* producer, - HloInstruction* consumer, DoNotFuseSet* do_not_fuse) { - auto could_fuse_on_all_paths = [&] { - // First check to see if we have already marked this producer as infeasible - // to fuse into consumer. - if (do_not_fuse->count(producer) > 0) { + HloInstruction* producer, HloInstruction* consumer, + const HloReachabilityMap& reachability_map, + const DoNotFuseSet& do_not_fuse) { + if (consumer == producer) { + return true; + } + if (!consumer->IsFusable()) { + return false; + } + for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { + auto* consumer_operand = consumer->mutable_operand(i); + // If the operand is not on a path to the producer, it doesn't matter + // whether it's fusable. + if (!reachability_map.IsReachable(producer, consumer_operand)) { + continue; + } + if (do_not_fuse.count(consumer_operand) > 0 || !ShouldFuse(consumer, i)) { return false; } - // Make sure it is possible for producer and consumer to exist in a fusion - // node. - if (!producer->IsFusable() || !consumer->IsFusable()) { + // The producer is reachable from consumer_operand which means we need + // to be able to fuse consumer_operand into consumer in order for + // producer to be fusable into consumer on all paths. + // Perform the recursive step: make sure producer can be fused into + // consumer_operand on all paths. + if (!CanFuseOnAllPaths(producer, consumer_operand, reachability_map, + do_not_fuse)) { return false; } - // We do an upward walk of the graph from consumer towards all paths which - // lead to producer to find any unfusable paths. - for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { - auto* consumer_operand = consumer->mutable_operand(i); - if (consumer_operand == producer) { - // This is the base case: our upward crawl ends but we need to make sure - // that fusion from consumer can happen. - if (!ShouldFuse(consumer, i)) { - return false; - } - } else if (reachability_map.IsReachable(producer, consumer_operand)) { - // The reachability map told us that consumer_operand is a node on the - // path to producer. We need to further investigate from - // consumer_operand. - - // First check if we have already ruled out fusing producer into - // consumer_operand. - if (do_not_fuse->count(consumer_operand) > 0) { - return false; - } - // Make sure it is possible for consumer_operand to exist in a fusion - // node. - if (!consumer_operand->IsFusable()) { - return false; - } - // The producer is reachable from consumer_operand which means we need - // to be able to fuse consumer_operand into consumer in order for - // producer to be fusable into consumer on all paths. - if (!ShouldFuse(consumer, i)) { - return false; - } - // Perform the recursive step: make sure producer can be fused into - // consumer_operand on all paths. - if (!CanFuseOnAllPaths(reachability_map, producer, consumer_operand, - do_not_fuse)) { - return false; - } + } + return true; +} + +InstructionFusion::DoNotFuseSet InstructionFusion::ComputeGloballyUnfusable( + tensorflow::gtl::ArraySlice post_order) { + auto reachability = computation_->ComputeReachability(); + + // Forbid fusion of producers that: + // a) Need to be duplicated, unless they can be fused into all consumers + // via all paths. + // b) Are more than unary, that is, fusing them would likely lead to an + // increase in memory bandwidth use. + // + // Note that if we allow fusion by these global rules, we may still forbid + // fusing operations that require duplication later depending on + // is_expensive_(). + DoNotFuseSet do_not_fuse; + for (HloInstruction* consumer : post_order) { + for (HloInstruction* producer : consumer->operands()) { + if (do_not_fuse.count(producer) > 0) { + continue; } + + // If the producer is effectively not more than unary, duplicating it + // will not increase the number of relevant inputs read, as the fusion + // node will only need to read at most 1 relevant input (the input of + // the producer). In that case, we do not forbid fusion of the operation + // here. + if (EffectivelyAtMostUnary(producer)) { + continue; + } + // Otherwise we will forbid fusing the op unless we can fuse it into + // all of its consumers on all paths. + // + // That means, that for: + // A --> B (fusable) + // \-> C (non-fusable) + // A will be not allowed to be fused into B, as it cannot be fused into C. + // + // Similarly, for: + // A -------------> B + // \-> C -> D -/ + // If: + // - A is fusable into B and C, and D is fusable into B + // - C is *not* fusable into D + // A will be not allowed to be fused into B, as it cannot be fused via + // all paths. + if (producer->IsFusable() && + CanFuseOnAllPaths(producer, consumer, *reachability, do_not_fuse)) { + continue; + } + do_not_fuse.insert(producer); } - return true; - }; - if (could_fuse_on_all_paths()) { - return true; } - // We couldn't fuse on all paths, record this result. - do_not_fuse->insert(producer); - return false; + + return do_not_fuse; } StatusOr InstructionFusion::Run(HloModule* module) { @@ -244,36 +269,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { InsertOrDie(&post_order_index, post_order[i], i); } - DoNotFuseSet do_not_fuse; - auto reachability = computation->ComputeReachability(); - - auto cheap_to_duplicate = [this](HloInstruction* producer) { - if (producer->opcode() == HloOpcode::kBroadcast) { - return true; - } - if (producer->opcode() == HloOpcode::kConstant && - ShapeUtil::IsEffectiveScalar(producer->shape())) { - return true; - } - if (EffectivelyUnary(producer)) { - return true; - } - return false; - }; - - for (HloInstruction* consumer : post_order) { - for (HloInstruction* producer : consumer->operands()) { - if (cheap_to_duplicate(producer)) { - continue; - } - if (CanFuseOnAllPaths(*reachability, producer, consumer, - &do_not_fuse)) { - CHECK_EQ(do_not_fuse.count(producer), 0); - } else { - CHECK_GT(do_not_fuse.count(producer), 0); - } - } - } + DoNotFuseSet do_not_fuse = ComputeGloballyUnfusable(post_order); // Instruction fusion effectively fuses edges in the computation graph // (producer instruction -> consumer instruction) so we iterate over all diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 152d088..2ea1fcf 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -70,11 +70,11 @@ class InstructionFusion : public HloPassInterface { virtual HloInstruction* Fuse(HloInstruction* producer, HloInstruction* consumer); - // An "effectively unary" operation is one that has one "large" + // An "effectively unary" operation is one that has at most one "large" // input with the others being negligible in terms of memory usage. // We use "has a smaller true rank than the output" as a heuristic // for "negligible" memory usage. - bool EffectivelyUnary(HloInstruction* hlo); + bool EffectivelyAtMostUnary(HloInstruction* hlo); // Returns true if fusing producer into consumer would cause producer to be // duplicated. This is the case if producer has uses other than consumer. @@ -95,11 +95,16 @@ class InstructionFusion : public HloPassInterface { // The set of producers whose consumers we cannot fuse into. using DoNotFuseSet = std::unordered_set; - // Whether or not we can fuse consumer into original_producer on all paths + // Whether or not we can fuse producer into consumer on all paths // from the producer to the consumer where nodes are HLOs and edges are uses. - bool CanFuseOnAllPaths(const HloReachabilityMap& reachability_map, - HloInstruction* producer, HloInstruction* consumer, - DoNotFuseSet* do_not_fuse); + bool CanFuseOnAllPaths(HloInstruction* producer, HloInstruction* consumer, + const HloReachabilityMap& reachability_map, + const DoNotFuseSet& do_not_fuse); + + // Computes the set of nodes that we do not want to fuse into any of their + // consumers based on a global analysis of the HLO graph. + DoNotFuseSet ComputeGloballyUnfusable( + tensorflow::gtl::ArraySlice post_order); // Used to determine if an HLO is expensive. Expensive operations will not be // duplicated. diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index 0fa2c95..e78b99a 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { @@ -92,6 +93,161 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { .ValueOrDie()); } +// Counts the number of HLO ops with a given op code in the specified module. +static int Count(const HloModule& module, HloOpcode op) { + int count = 0; + for (const auto* computation : module.computations()) { + for (const auto* instruction : computation->instructions()) { + if (instruction->opcode() == op) { + ++count; + } + } + } + return count; +} + +TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) { + auto module = tools::Parse(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + ROOT root = f32[4,3]{1,0} subtract(add, add) + })") + .ValueOrDie(); + // Expect the add and subtraction to be fused. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); +} + +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { + // Make sure we do not duplicate the add, as we cannot fuse through the rng. + // + // p0 -> add -------------------------> sub + // \-> abs1 -> rng -> abs2 -/ + auto module = tools::Parse(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + abs1 = f32[4,3]{1,0} abs(add) + rng = f32[4,3]{1,0} rng(abs1), distribution=rng_uniform + abs2 = f32[4,3]{1,0} abs(rng) + ROOT root = f32[4,3]{1,0} subtract(abs2, add) + })") + .ValueOrDie(); + // We expect abs2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); + + // Use a log node with a second consumer to break the fusion. + // + // p0 -> add -------------------------> sub + // \-> abs1 -> log -> abs2 -/ + // \-> send + module = tools::Parse(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + abs1 = f32[4,3]{1,0} abs(add) + log = f32[4,3]{1,0} log(abs1) + send = f32[4,3]{1,0} send(log), channel_id=0 + abs2 = f32[4,3]{1,0} abs(log) + ROOT root = f32[4,3]{1,0} subtract(abs2, add) + })") + .ValueOrDie(); + + // We expect abs2 to be fused into root and abs1 to be fused into log. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 2) << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); + + // Make sure we still fuse ops where one operand in the chain to the producer + // can't be fused. + // + // p0 ---> add1 -----------> sub + // \ \-> add2 -/ + // \-> log -/ + // \-> send + module = tools::Parse(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add1 = f32[4,3]{1,0} add(p0, p0) + log = f32[4,3]{1,0} log(p0) + send = f32[4,3]{1,0} send(log), channel_id=0 + add2 = f32[4,3]{1,0} add(log, add1) + ROOT root = f32[4,3]{1,0} subtract(add1, add2) + })") + .ValueOrDie(); + + // Expect the add1 and add2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure we didn't duplicate any adds. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); + + // A variant of the above that allows the algorithm to put add2 into the set + // of unfusable ops to short-circuit the decision whether add1 should be fused + // into sub2. + // + // /---------------\ + // p0 ---> add1 ---> add2 ------> sub2 + // \------> sub1 + // log -/ + // \-> send + module = tools::Parse(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add1 = f32[4,3]{1,0} add(p0, p0) + add2 = f32[4,3]{1,0} add(add1, add1) + log = f32[4,3]{1,0} log(add2) + send = f32[4,3]{1,0} send(log), channel_id=0 + sub1 = f32[4,3]{1,0} subtract(log, add2) + sub2 = f32[4,3]{1,0} subtract(add2, add1) + ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2) + })") + .ValueOrDie(); + + // Expect sub1 and sub2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure we didn't duplicate any adds. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); +} + TEST_F(InstructionFusionTest, AllowUnaryDuplication) { HloComputation::Builder builder(TestName()); auto shape = ShapeUtil::MakeShape(F32, {16, 16}); -- 2.7.4