From: Peter Hawkins
Date: Fri, 4 May 2018 17:18:46 +0000 (-0700)
Subject: Change RecvTensor RPC implementation to use DeviceContext::CopyDeviceTensorToCPU...
X-Git-Tag: upstream/v1.9.0_rc1~162^2^2~5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3db0e545d2460be0392dfcaa304231cd2105648e;p=platform%2Fupstream%2Ftensorflow.git

Change RecvTensor RPC implementation to use DeviceContext::CopyDeviceTensorToCPU
rather than calling GPUUtil::CopyGPUTensorToCPU.

The direct call into the GPU code is problematic for non-GPU devices.

PiperOrigin-RevId: 195433287
---

diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index e973a22..c2719f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -169,7 +169,6 @@ tf_cuda_library(
         ":grpc_worker_service_impl",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:worker_proto_cc",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index bbf7391..26fad1f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -23,9 +23,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
@@ -439,10 +436,10 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
   opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
-      [opts, response, done, src_dev](const Status& status,
-                                      const Rendezvous::Args& send_args,
-                                      const Rendezvous::Args& recv_args,
-                                      const Tensor& val, const bool is_dead) {
+      [opts, response, done, src_dev, request](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args& recv_args, const Tensor& val,
+          const bool is_dead) {
         opts->ClearCancelCallback();
         if (status.ok()) {
           // DMA can only be used for Tensors that do not fall into
@@ -455,8 +452,7 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
           {
             // Non-DMA cases.
             if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
-#if GOOGLE_CUDA
-              const DeviceContext* send_dev_context = send_args.device_context;
+              DeviceContext* send_dev_context = send_args.device_context;
               AllocatorAttributes alloc_attrs;
               alloc_attrs.set_gpu_compatible(true);
               alloc_attrs.set_on_host(true);
@@ -465,7 +461,8 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
               CHECK(send_dev_context)
                   << "send dev name: " << src_dev->name()
                   << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-              // "val" is on a GPU. Uses GPUUtil to fill the copy on host.
+              // "val" is on an accelerator device. Uses the device_context to
+              // fill the copy on host.
               StatusCallback copy_ready = [response, done, copy,
                                            is_dead](const Status& s) {
                 // The value is now ready to be returned on the wire.
@@ -474,11 +471,8 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
                 delete copy;
               };
 
-              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
-                                          copy_ready);
-#else
-              done(errors::Internal("No GPU device in process"));
-#endif  // GOOGLE_CUDA
+              send_dev_context->CopyDeviceTensorToCPU(
+                  &val, request->rendezvous_key(), src_dev, copy, copy_ready);
             } else {
               grpc::EncodeTensorToByteBuffer(is_dead, val, response);
               done(Status::OK());
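
Note: the sketch below is not part of this commit; it only illustrates why dispatching
through the virtual method helps. After this change, GrpcWorker::GrpcRecvTensorAsync
reaches the sending device through send_args.device_context and calls
DeviceContext::CopyDeviceTensorToCPU, so a non-GPU accelerator can participate in the
RecvTensor path simply by overriding that method on its own DeviceContext. The class
name CustomAcceleratorDeviceContext and the elided transfer call are hypothetical
placeholders, not TensorFlow APIs.

// Hypothetical, minimal DeviceContext for a non-GPU accelerator, assuming the
// TF 1.9-era DeviceContext::CopyDeviceTensorToCPU signature shown in the diff.
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

class CustomAcceleratorDeviceContext : public DeviceContext {
 public:
  void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                             StringPiece tensor_name, Device* device,
                             Tensor* cpu_tensor, StatusCallback done) override {
    // A real implementation would enqueue an asynchronous device-to-host copy
    // of device_tensor's buffer into cpu_tensor using the device's own
    // transfer mechanism (elided here) and invoke `done` when it finishes.
    // Placeholder: signal success immediately.
    done(Status::OK());
  }
};

}  // namespace tensorflow

In GrpcRecvTensorAsync such a context would arrive as send_args.device_context, which
is exactly the pointer the new code invokes the copy on, whereas the old
GPUUtil::CopyGPUTensorToCPU call compiled only under GOOGLE_CUDA.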