namespace gpu {
bool ConvUseLayoutHeuristic(const HloModuleConfig& config) {
- return config.debug_options().xla_backend_extra_options().count(
- "xla_gpu_experimental_conv_use_layout_heuristic");
+ return !config.debug_options().xla_backend_extra_options().count(
+ "xla_gpu_experimental_conv_disable_layout_heuristic");
}
} // namespace gpu
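The polarity of the backend option is inverted here: the convolution layout
heuristic is now on by default, and the extra option disables it instead of
enabling it. A minimal usage sketch of opting out, assuming
xla_backend_extra_options is a proto map<string, string> (consistent with the
.count() lookup above) and the standard HloModuleConfig::set_debug_options
setter:

// Hypothetical usage sketch; not part of this change.
DebugOptions debug_options;
(*debug_options.mutable_xla_backend_extra_options())
    ["xla_gpu_experimental_conv_disable_layout_heuristic"] = "";
HloModuleConfig config;
config.set_debug_options(debug_options);
assert(!gpu::ConvUseLayoutHeuristic(config));  // heuristic disabled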
// Divide the input matrix into tiles of size K x 1. For example, when the
// input matrix is 4x4 and K = 2, the tiled matrix looks like
//
//   0123
//   0123
//   4567
//   4567  // Numbers indicate tile IDs.
//
// Each tile is first partially reduced to a scalar by a thread, and then the
- // scalar is accumulated to the output vector using atomic operations. We
- // choose 16 as the tile size, which matches Eigen's ColumnReduceKernel.
- constexpr int64 kTileSize = 16;
+ // scalar is accumulated to the output vector using atomic operations.
+ //
+ // We choose 128 as the tile size based on empirical evidence. It's big enough
+ // to reduce the number of atomic adds at the end, maximizing memory
+ // bandwidth.
+ constexpr int64 kTileSize = 128;
+
// If the height is not a multiple of the tile size, we pad the bottom of the
// input matrix.
const int64 height_in_tiles = CeilOfRatio(height, kTileSize);
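To make the tiling and padding concrete: with kTileSize = 128, a 1000-row
matrix has CeilOfRatio(1000, 128) = 8 tiles per column, and the padded rows of
the bottom tile contribute nothing to the sum. Below is a CPU analogue of the
scheme, a sketch under stated assumptions: the 1000x4 shape and all names are
illustrative, the serial loop stands in for one GPU thread per (tile, column)
pair, and fetch_add on integers stands in for the GPU's atomic add.

#include <atomic>
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for XLA's CeilOfRatio; valid for positive operands.
int64_t CeilOfRatio(int64_t numerator, int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}

// Each (tile, column) pair is first reduced to a scalar (the per-thread
// partial reduction), then accumulated into the output with one atomic add.
void ColumnReduce(const std::vector<int64_t>& input, int64_t height,
                  int64_t width, std::vector<std::atomic<int64_t>>& output) {
  constexpr int64_t kTileSize = 128;
  const int64_t height_in_tiles = CeilOfRatio(height, kTileSize);
  for (int64_t tile = 0; tile < height_in_tiles; ++tile) {
    for (int64_t col = 0; col < width; ++col) {
      int64_t partial = 0;  // identity element for addition
      for (int64_t i = 0; i < kTileSize; ++i) {
        const int64_t row = tile * kTileSize + i;
        if (row < height) partial += input[row * width + col];  // skip pad
      }
      output[col].fetch_add(partial);  // one atomic add per tile
    }
  }
}

int main() {
  const int64_t height = 1000, width = 4;  // example shape (assumption)
  std::vector<int64_t> input(height * width, 1);
  std::vector<std::atomic<int64_t>> output(width);
  for (auto& o : output) o.store(0);
  ColumnReduce(input, height, width, output);
  std::cout << output[0].load() << std::endl;  // prints 1000
  return 0;
}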
name = "convolution_test_gpu_alternative_layout",
timeout = "long",
srcs = ["convolution_test.cc"],
- backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_use_layout_heuristic"]},
+ backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
backends = ["gpu"],
shard_count = 25,
deps = CONVOLUTION_TEST_DEPS,
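With the renamed option, this target now covers the path with the layout
heuristic turned off, while the default test suite covers the heuristic
itself. A hypothetical invocation, where //path/to is a placeholder for
whichever package actually holds this BUILD file:

bazel test //path/to:convolution_test_gpu_alternative_layout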