From 2e707494c4b1058e1186c67b1030f635bdf52dac Mon Sep 17 00:00:00 2001
From: Guangda Lai
Date: Thu, 22 Feb 2018 12:24:22 -0800
Subject: [PATCH] Fix BaseGPUDevice, let it report the actual memory limit of
 the allocator.

Also added a helper method to reset ProcessState.

PiperOrigin-RevId: 186655996
---
 tensorflow/core/common_runtime/gpu/gpu_device.cc | 29 +++++++++++++++++-------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 15ff15f..8357cc5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -1013,21 +1013,34 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
   CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   int numa_node = dev_locality.numa_node();
-  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
   gpu::StreamExecutor* se =
       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   const gpu::DeviceDescription& desc = se->GetDeviceDescription();
-  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
-            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
-            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
 
   ProcessState* process_state = ProcessState::singleton();
+  Allocator* gpu_allocator = process_state->GetGPUAllocator(
+      options.config.gpu_options(), tf_gpu_id, memory_limit);
+  if (gpu_allocator == nullptr) {
+    return errors::Internal("Failed to get memory allocator for TF GPU ",
+                            tf_gpu_id.value(), " with ", memory_limit,
+                            " bytes of memory.");
+  }
+  AllocatorStats stats;
+  gpu_allocator->GetStats(&stats);
+  // 'memory_limit' is the required memory size, but if the allocator with given
+  // tf_gpu_id was created before, we'll use it instead of creating a new one
+  // (as TF gpu device is a shared resource), in which case the actual memory
+  // limit represented by 'stats.bytes_limit' used by that allocator may be
+  // different (which should be an error).
+  //
+  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
   BaseGPUDevice* gpu_device = CreateGPUDevice(
-      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
-      GetShortDeviceDescription(cuda_gpu_id, desc),
-      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
-                                     memory_limit),
+      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
+      tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator,
       process_state->GetCPUAllocator(numa_node));
+  LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
+            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
+            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(gpu_device);
-- 
2.7.4
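
Note on the pattern this patch implements: GetGPUAllocator may return an
allocator that was created by an earlier caller (the TF GPU device is a shared
resource), in which case the memory_limit argument is ignored and the allocator
keeps its original limit. The device must therefore read the effective limit
back via GetStats() instead of reporting the requested one. Below is a minimal
standalone sketch of that pattern; the Allocator, AllocatorStats, and
GetGPUAllocator definitions here are simplified stand-ins for TensorFlow's,
not the real interfaces.

#include <cstdint>
#include <iostream>
#include <map>

// Simplified stand-ins for TensorFlow's AllocatorStats/Allocator.
struct AllocatorStats {
  int64_t bytes_limit = 0;
};

class Allocator {
 public:
  explicit Allocator(int64_t bytes_limit) { stats_.bytes_limit = bytes_limit; }
  void GetStats(AllocatorStats* stats) const { *stats = stats_; }

 private:
  AllocatorStats stats_;
};

// Mimics ProcessState::GetGPUAllocator: the first request for a given device
// id creates the allocator; later requests return the cached instance and
// silently ignore the new memory_limit. That is why the caller has to read
// the limit back from the allocator rather than trust its own argument.
Allocator* GetGPUAllocator(int tf_gpu_id, int64_t memory_limit) {
  static std::map<int, Allocator> allocators;
  auto it = allocators.find(tf_gpu_id);
  if (it == allocators.end()) {
    it = allocators.emplace(tf_gpu_id, Allocator(memory_limit)).first;
  }
  return &it->second;
}

int main() {
  // First caller creates the allocator for GPU 0 with a 4 GiB limit.
  GetGPUAllocator(/*tf_gpu_id=*/0, int64_t{4} << 30);

  // A second caller requests 2 GiB but receives the shared 4 GiB allocator.
  Allocator* gpu_allocator = GetGPUAllocator(/*tf_gpu_id=*/0, int64_t{2} << 30);
  AllocatorStats stats;
  gpu_allocator->GetStats(&stats);

  // Report the allocator's actual limit (4096 MB), not the requested 2048 MB,
  // matching the corrected LOG(INFO) in the patch.
  std::cout << "memory limit: " << (stats.bytes_limit >> 20) << " MB\n";
}

As the TODO in the diff notes, a mismatch between the requested memory_limit
and stats.bytes_limit is still accepted silently at this point; the patch only
makes the logged and reported number truthful.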