From e69000c347ddf023a3b1926d812881fd8c5a055b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 21 Mar 2018 04:09:32 -0700
Subject: [PATCH] We were ValueOrDie()ing in one place, and TF_CHECK_OK()ing in
 another. Both should gracefully return an error condition.

Add some tests to check this.

PiperOrigin-RevId: 189888700
---
 tensorflow/compiler/jit/xla_device_context.cc | 12 +++--
 tensorflow/compiler/jit/xla_launch_util.cc    | 14 ++++--
 tensorflow/compiler/tests/BUILD               | 20 ++++++++
 tensorflow/compiler/tests/oom_test.py         | 72 +++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/compiler/tests/oom_test.py

diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 88f7c15..93e0dbb 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -36,10 +36,14 @@ XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 string XlaDeviceAllocator::Name() { return "xla"; }
 
 void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  se::DeviceMemoryBase dmem =
-      backend_->memory_allocator()
-          ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false)
-          .ValueOrDie();
+  auto status_or_dmem = backend_->memory_allocator()->Allocate(
+      device_ordinal_, num_bytes, /*retry_on_failure=*/false);
+  if (!status_or_dmem.status().ok()) {
+    LOG(ERROR) << "Failed to allocate memory: "
+               << status_or_dmem.status().ToString();
+    return nullptr;
+  }
+  se::DeviceMemoryBase dmem = status_or_dmem.ValueOrDie();
   VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes
           << ")";
   return dmem.opaque();
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index bb7316c..21f58c8 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -56,12 +56,20 @@ XlaAllocator::XlaAllocator(const gpu::Platform* platform,
                            OpKernelContext* op_context)
     : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
 
-XlaAllocator::~XlaAllocator() { CHECK(allocated_.empty()); }
+XlaAllocator::~XlaAllocator() {
+  for (void* ptr : allocated_) {  // Free every buffer still being tracked.
+    op_context_->device()->GetAllocator({})->DeallocateRaw(ptr);
+  }
+}
 
 xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   void* data = op_context_->device()->GetAllocator({})->AllocateRaw(
       Allocator::kAllocatorAlignment, size);
+  if (!data) {  // No buffer came back; report it instead of tracking nullptr.
+    return errors::ResourceExhausted(
+        "OOM when allocating temporary tensor with size ", size);
+  }  // NOTE(review): size == 0 may also yield nullptr; confirm intent.
   allocated_.insert(data);
   return gpu::DeviceMemoryBase(data, size);
 }
@@ -182,8 +190,8 @@ void XlaComputationLaunchContext::PopulateOutputs(
         // Copy host -> device. (Empty tensors don't have backing buffers.)
         VLOG(1) << "Constant output tensor on device";
 
-        TF_CHECK_OK(
-            ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+        OP_REQUIRES_OK(
+            ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
 
         const void* src_ptr = DMAHelper::base(&const_tensor);
         void* dst_ptr = DMAHelper::base(output_tensor);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index bbb6089..26d4ca0 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -192,6 +192,26 @@ tf_xla_py_test(
 )
 
 tf_xla_py_test(
+    name = "oom_test",
+    size = "medium",
+    srcs = ["oom_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
+    deps = [  # NOTE(review): oom_test.py imports control_flow_ops; verify a dep supplies it.
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:array_ops_gen",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradient_checker",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
     srcs = ["conv2d_test.py"],
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
new file mode 100644
index 0000000..66be0d6
--- /dev/null
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for out-of-memory conditions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import googletest
+
+
+class OutOfMemoryTest(xla_test.XLATestCase):
+
+  def testOutputOutOfMemory(self):
+    """Requests a computed tensor that is expected to exhaust memory.
+
+    Generates a large rank-1 tensor. The tensor is an output of an XLA
+    computation, not constant.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+    """
+    size = int(5e8)  # Shape dimensions must be integers, not floats.
+    with self.test_session():
+      # Force the compiled code to not be constant by feeding in an addend.
+      p = array_ops.placeholder(dtypes.float32, shape=[])
+      with self.test_scope():
+        # Create a large R1 tensor.
+        c = array_ops.zeros([size]) + p
+
+        self.assertRaises(
+            errors.ResourceExhaustedError, lambda: c.eval(feed_dict={p: 1.0}))
+
+  def testConstantOutOfMemory(self):
+    """Requests constant tensors that are expected to exhaust memory.
+
+    Generates a large rank-1 tensor and a small rank-1 tensor. The tensors are
+    constant outputs of an XLA computation, not variable.
+
+    Multiple constant outputs are created, one small, one large. The small
+    tensor will have already been allocated when the large tensor fails.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+    """
+    size = int(5e8)  # Shape dimensions must be integers, not floats.
+    with self.test_session() as sess:
+      with self.test_scope():
+        # Create two R1 tensors: one with 5 elements, one with `size` elements.
+        b = array_ops.zeros([5])
+        c = array_ops.zeros([size])
+        e = control_flow_ops.tuple([b, c])
+        self.assertRaises(errors.ResourceExhaustedError, lambda: sess.run(e))
+
+
+if __name__ == "__main__":
+  googletest.main()
-- 
2.7.4