Turn on heuristic (mostly-NHWC) convolution layout assignment for (V100, fp16) by default.

author A. Unique TensorFlower <gardener@tensorflow.org>

Sat, 26 May 2018 00:22:17 +0000 (17:22 -0700)

committer TensorFlower Gardener <gardener@tensorflow.org>

Sat, 26 May 2018 00:26:59 +0000 (17:26 -0700)
author A. Unique TensorFlower <gardener@tensorflow.org>
Sat, 26 May 2018 00:22:17 +0000 (17:22 -0700)
committer TensorFlower Gardener <gardener@tensorflow.org>
Sat, 26 May 2018 00:26:59 +0000 (17:26 -0700)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.cc b/tensorflow/compiler/xla/service/gpu/gpu_options.cc

index 174aaf1..35b4b4e 100644 (file)
--- a/tensorflow/compiler/xla/service/gpu/gpu_options.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_options.cc
@@ -20,8 +20,8 @@ namespace xla {
  namespace gpu {
  
  bool ConvUseLayoutHeuristic(const HloModuleConfig& config) {
-  return config.debug_options().xla_backend_extra_options().count(
-      "xla_gpu_experimental_conv_use_layout_heuristic");
+  return !config.debug_options().xla_backend_extra_options().count(
+      "xla_gpu_experimental_conv_disable_layout_heuristic");
  }
  
  }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc

index d07d197..ae4e305 100644 (file)
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1171,9 +1171,13 @@ Status IrEmitterUnnested::EmitColumnReduction(
    //   4567  // Numbers indicate tile IDs.
    //
    // Each tile is first partially reduced to a scalar by a thread, and then the
-  // scalar is accumulated to the output vector using atomic operations. We
-  // choose 16 as the tile size, which matches Eigen's ColumnReduceKernel.
-  constexpr int64 kTileSize = 16;
+  // scalar is accumulated to the output vector using atomic operations.
+  //
+  // We choose 128 as the tile size based on empirical evidence. It's big enough
+  // to reduce the amount of atomic adds in the end, maximizing the memory
+  // bandwidth.
+  constexpr int64 kTileSize = 128;
+
    // If the height is not a multiple of the tile size, we pad the bottom of the
    // input matrix.
    const int64 height_in_tiles = CeilOfRatio(height, kTileSize);
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD

index 1a12fd0..a62d49e 100644 (file)
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -808,7 +808,7 @@ xla_test(
      name = "convolution_test_gpu_alternative_layout",
      timeout = "long",
      srcs = ["convolution_test.cc"],
-    backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_use_layout_heuristic"]},
+    backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
      backends = ["gpu"],
      shard_count = 25,
      deps = CONVOLUTION_TEST_DEPS,
author	A. Unique TensorFlower <gardener@tensorflow.org>
	Sat, 26 May 2018 00:22:17 +0000 (17:22 -0700)
committer	TensorFlower Gardener <gardener@tensorflow.org>
	Sat, 26 May 2018 00:26:59 +0000 (17:26 -0700)
tensorflow/compiler/xla/service/gpu/gpu_options.cc		patch | blob | history
tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc		patch | blob | history
tensorflow/compiler/xla/tests/BUILD		patch | blob | history