From e69000c347ddf023a3b1926d812881fd8c5a055b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 21 Mar 2018 04:09:32 -0700 Subject: [PATCH] We were ValueOrDie()ing in one place, and TF_CHECK_OK()ing in another. Both should gracefully return an error condition. Add some tests to check this. PiperOrigin-RevId: 189888700 --- tensorflow/compiler/jit/xla_device_context.cc | 12 +++-- tensorflow/compiler/jit/xla_launch_util.cc | 14 ++++-- tensorflow/compiler/tests/BUILD | 20 ++++++++ tensorflow/compiler/tests/oom_test.py | 72 +++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tensorflow/compiler/tests/oom_test.py diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 88f7c15..93e0dbb 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -36,10 +36,14 @@ XlaDeviceAllocator::~XlaDeviceAllocator() = default; string XlaDeviceAllocator::Name() { return "xla"; } void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { - se::DeviceMemoryBase dmem = - backend_->memory_allocator() - ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false) - .ValueOrDie(); + auto status_or_dmem = backend_->memory_allocator()->Allocate( + device_ordinal_, num_bytes, /*retry_on_failure=*/false); + if (!status_or_dmem.status().ok()) { + LOG(ERROR) << "Failed to allocate memory: " + << status_or_dmem.status().ToString(); + return nullptr; + } + se::DeviceMemoryBase dmem = status_or_dmem.ValueOrDie(); VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes << ")"; return dmem.opaque(); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index bb7316c..21f58c8 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -56,12 +56,20 @@ XlaAllocator::XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context) : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {} -XlaAllocator::~XlaAllocator() { CHECK(allocated_.empty()); } +XlaAllocator::~XlaAllocator() { + for (void* ptr : allocated_) { + op_context_->device()->GetAllocator({})->DeallocateRaw(ptr); + } +} xla::StatusOr XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { void* data = op_context_->device()->GetAllocator({})->AllocateRaw( Allocator::kAllocatorAlignment, size); + if (!data) { + return errors::ResourceExhausted( + "OOM when allocating temporary tensor with size ", size); + } allocated_.insert(data); return gpu::DeviceMemoryBase(data, size); } @@ -182,8 +190,8 @@ void XlaComputationLaunchContext::PopulateOutputs( // Copy host -> device. (Empty tensors don't have backing buffers.) VLOG(1) << "Constant output tensor on device"; - TF_CHECK_OK( - ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); + OP_REQUIRES_OK( + ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); const void* src_ptr = DMAHelper::base(&const_tensor); void* dst_ptr = DMAHelper::base(output_tensor); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index bbb6089..26d4ca0 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -192,6 +192,26 @@ tf_xla_py_test( ) tf_xla_py_test( + name = "oom_test", + size = "medium", + srcs = ["oom_test.py"], + disabled_backends = [ + "cpu", + "cpu_ondemand", + ], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:array_ops_gen", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:gradient_checker", + "//tensorflow/python:gradients", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + +tf_xla_py_test( name = "conv2d_test", size = "medium", srcs = ["conv2d_test.py"], diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py new file mode 100644 index 0000000..66be0d6 --- /dev/null +++ b/tensorflow/compiler/tests/oom_test.py @@ -0,0 +1,72 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for out-of-memory conditions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.platform import googletest + + +class OutOfMemoryTest(xla_test.XLATestCase): + + def testOutputOutOfMemory(self): + """Allocates tensors until out of memory. + + Generates a large rank-1 tensor. The tensor is an output of an XLA + computation, not constant. + + Check that a ResourceExhaustedError is raised and can be caught. + """ + size = 5e8 + with self.test_session(): + # Force the compiled code to not be constant by feeding in an addend. + p = array_ops.placeholder(dtypes.float32, shape=[]) + with self.test_scope(): + # Create a large R1 tensor. + c = array_ops.zeros([size]) + p + + self.assertRaises( + errors.ResourceExhaustedError, lambda: c.eval(feed_dict={p: 1.0})) + + def testConstantOutOfMemory(self): + """Allocates constant tensors until out of memory. + + Generates a large rank-1 tensor and a small rank-1 tensor. The tensors are + constant outputs of an XLA computation, not variable. + + Multiple constant outputs are created, one small, one large. The small + tensor will have already been allocated when the large tensor fails. + + Check that a ResourceExhaustedError is raised and can be caught. + """ + size = 5e8 + with self.test_session() as sess: + with self.test_scope(): + # Create two R1 tensors, size 5 and size n. + b = array_ops.zeros([5]) + c = array_ops.zeros([size]) + e = control_flow_ops.tuple([b, c]) + self.assertRaises(errors.ResourceExhaustedError, lambda: sess.run(e)) + + +if __name__ == "__main__": + googletest.main() -- 2.7.4