From 67b6696f9620734369ae99e7895fa6570d7faca6 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Tue, 22 May 2018 12:34:51 -0700
Subject: [PATCH] [XLA:GPU] Emit fused reduces from batchnorm expander

This is an intermediate step until we have working multi-output fusion.
Once we have it, this change should be reverted as it might interfere
with fusion.

PiperOrigin-RevId: 197605814
---
 tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index d50153d..1445684 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -157,11 +157,13 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) {
       pass.AddPass<CudnnBatchNormRewriter>();
     }
+    // TODO(kramerb): Remove use_fusion once instruction fusion can create
+    // multi-output fusions from the unfused expander output.
     pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true,
-        /*use_fusion=*/false);
+        /*use_fusion=*/true);
 
     // Rewrite gather ops into smaller ones.
     pass.AddPass<GatherExpander>();
-- 
2.7.4
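
For context, a minimal standalone sketch of the call site this patch produces,
written against the HloPassPipeline API at this revision. The wrapper function
and its parameter name below are hypothetical, for illustration only; the
BatchNormExpander arguments mirror the hunk above.

    // Sketch only: AddBatchNormPasses is a hypothetical helper, not part of
    // the patched file. The BatchNormExpander flags match the diff above.
    #include "tensorflow/compiler/xla/service/batchnorm_expander.h"
    #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

    namespace xla {

    void AddBatchNormPasses(HloPassPipeline* pass) {
      // With /*use_fusion=*/true the expander emits the reduces it creates
      // for mean and variance inside a fusion instruction ("fused reduces"),
      // an interim stand-in until instruction fusion can build multi-output
      // fusions from the unfused expander output (see the TODO above).
      pass->AddPass<BatchNormExpander>(
          /*rewrite_training_op=*/true,
          /*rewrite_inference_op=*/true,
          /*rewrite_grad_op=*/true,
          /*use_fusion=*/true);
    }

    }  // namespace xla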