[XLA:GPU] Zero out input buffers before running cudnn conv autotune.
author Justin Lebar <jlebar@google.com>
Sat, 5 May 2018 19:34:32 +0000 (12:34 -0700)
committer TensorFlower Gardener <gardener@tensorflow.org>
Mon, 7 May 2018 23:21:52 +0000 (16:21 -0700)
We don't need a corresponding change in gemm_thunk.cc because, for gemms,
we autotune at runtime, at which point we already have real data in our
input/output buffers.

PiperOrigin-RevId: 195548896
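
For readers outside the XLA codebase, the pattern this change applies (allocate scratch device buffers, zero them, and only then time the candidate algorithms) can be sketched with the plain CUDA runtime API. The sketch below is illustrative only: the buffer sizes are made up, a device-to-device copy stands in for a real conv algorithm, and none of it uses the XLA/StreamExecutor types from the diff.

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  // Hypothetical buffer sizes; in the real code they come from the conv shapes.
  const size_t kInBytes = 1 << 20, kFiltBytes = 1 << 16, kOutBytes = 1 << 20;
  void *in = nullptr, *filt = nullptr, *out = nullptr;
  cudaMalloc(&in, kInBytes);
  cudaMalloc(&filt, kFiltBytes);
  cudaMalloc(&out, kOutBytes);

  // Zero the freshly allocated buffers before timing anything, mirroring the
  // ThenMemZero calls in the diff: uninitialized device memory could hold
  // bit patterns (e.g. denormals) that make the timed kernels unrepresentative.
  cudaMemset(in, 0, kInBytes);
  cudaMemset(filt, 0, kFiltBytes);
  cudaMemset(out, 0, kOutBytes);
  cudaDeviceSynchronize();  // roughly what BlockHostUntilDone() does in the diff

  // Time a stand-in "candidate" (here just a device-to-device copy) with CUDA
  // events, the way an autotuner times each conv algorithm it tries.
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  cudaMemcpyAsync(out, in, kInBytes, cudaMemcpyDeviceToDevice);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  std::printf("candidate took %.3f ms\n", ms);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(in);
  cudaFree(filt);
  cudaFree(out);
  return 0;
}

As in the diff, the zeroing happens once before the timing loop, so it costs one memset per buffer rather than per-algorithm overhead.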

tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc

index c4c56c5..41ee45f 100644
@@ -197,22 +197,42 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
   // We don't put any data in these buffers, because (in theory, anyway) the
   // speed of a conv isn't affected by the data being convolved.
   ScratchAllocator input_output_allocator(device_ordinal, allocator);
-  se::port::StatusOr<DeviceMemoryBase> input_buf =
+  StatusOr<DeviceMemoryBase> maybe_input_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(input_shape));
-  se::port::StatusOr<DeviceMemoryBase> filter_buf =
+  StatusOr<DeviceMemoryBase> maybe_filter_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(filter_shape));
-  se::port::StatusOr<DeviceMemoryBase> output_buf =
+  StatusOr<DeviceMemoryBase> maybe_output_buf =
       input_output_allocator.AllocateBytes(&stream,
                                            ShapeUtil::ByteSizeOf(output_shape));
-  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
+  if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() ||
+      !maybe_output_buf.ok()) {
     LOG(WARNING)
         << "Couldn't allocate space for input/filter/output of convolution "
         << instr->ToString() << ".  Falling back to default algorithm.";
     return nullopt;
   }
 
+  DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie();
+  DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie();
+  DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie();
+
+  // Although we don't have evidence this matters, zero out the buffers before
+  // autotuning.  It's conceivable that using uninitialized memory as the inputs
+  // might affect performance if e.g. the inputs contain denormals, and this is
+  // easy enough.
+  if (!stream.ThenMemZero(&input_buf, input_buf.size())
+           .ThenMemZero(&filter_buf, filter_buf.size())
+           .ThenMemZero(&output_buf, output_buf.size())
+           .BlockHostUntilDone()
+           .ok()) {
+    LOG(WARNING)
+        << "Couldn't zero out input/filter/output buffer for convolution "
+        << instr->ToString() << ".  Falling back to default algorithm.";
+    return nullopt;
+  }
+
   const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
       input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
@@ -225,12 +245,12 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
 
-    bool launch_ok = RunCudnnConvolution(
-                         kind, input_shape, filter_shape, output_shape,
-                         input_buf.ValueOrDie(), filter_buf.ValueOrDie(),
-                         output_buf.ValueOrDie(), &scratch_allocator, window,
-                         dnums, AlgorithmConfig(alg), &stream, &profile_result)
-                         .ok();
+    bool launch_ok =
+        RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                            input_buf, filter_buf, output_buf,
+                            &scratch_allocator, window, dnums,
+                            AlgorithmConfig(alg), &stream, &profile_result)
+            .ok();
 
     if (launch_ok && profile_result.is_valid()) {
       int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
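
On the denormal concern mentioned in the comment added above: uninitialized memory can hold any bit pattern, and some of those patterns decode to subnormal floats, which some hardware handles on a slower path. The commit itself only calls this conceivable, and the standalone C++ snippet below (not part of the change) merely shows that such a pattern classifies as subnormal; it is an illustration of the concern, not a benchmark.

#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // A 32-bit pattern with a zero exponent field and a nonzero mantissa decodes
  // to a subnormal (denormal) float, so "garbage" memory can easily contain
  // values below FLT_MIN that some floating-point units process more slowly.
  const std::uint32_t bits = 0x00000001;  // smallest positive subnormal float
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  std::printf("value = %g, subnormal = %d, FLT_MIN = %g\n", f,
              std::fpclassify(f) == FP_SUBNORMAL, FLT_MIN);
  return 0;
}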