From: Peter Hawkins
Date: Fri, 4 May 2018 17:18:46 +0000 (-0700)
Subject: Change RecvTensor RPC implementation to use DeviceContext::CopyDeviceTensorToCPU...
X-Git-Tag: upstream/v1.9.0_rc1~162^2^2~5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3db0e545d2460be0392dfcaa304231cd2105648e;p=platform%2Fupstream%2Ftensorflow.git

Change RecvTensor RPC implementation to use DeviceContext::CopyDeviceTensorToCPU
rather than calling GPUUtil::CopyGPUTensorToCPU.

The direct call into the GPU code is problematic for non-GPU devices.

PiperOrigin-RevId: 195433287
---

diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index e973a22..c2719f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -169,7 +169,6 @@ tf_cuda_library(
         ":grpc_worker_service_impl",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:worker_proto_cc",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index bbf7391..26fad1f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -23,9 +23,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
@@ -439,10 +436,10 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
   opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
-      [opts, response, done, src_dev](const Status& status,
-                                      const Rendezvous::Args& send_args,
-                                      const Rendezvous::Args& recv_args,
-                                      const Tensor& val, const bool is_dead) {
+      [opts, response, done, src_dev, request](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args& recv_args, const Tensor& val,
+          const bool is_dead) {
         opts->ClearCancelCallback();
         if (status.ok()) {
           // DMA can only be used for Tensors that do not fall into
@@ -455,8 +452,7 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
           {
             // Non-DMA cases.
             if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
-#if GOOGLE_CUDA
-              const DeviceContext* send_dev_context = send_args.device_context;
+              DeviceContext* send_dev_context = send_args.device_context;
               AllocatorAttributes alloc_attrs;
               alloc_attrs.set_gpu_compatible(true);
               alloc_attrs.set_on_host(true);
@@ -465,7 +461,8 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
               CHECK(send_dev_context)
                   << "send dev name: " << src_dev->name()
                   << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-              // "val" is on a GPU. Uses GPUUtil to fill the copy on host.
+              // "val" is on an accelerator device. Uses the device_context to
+              // fill the copy on host.
               StatusCallback copy_ready = [response, done, copy,
                                            is_dead](const Status& s) {
                 // The value is now ready to be returned on the wire.
@@ -474,11 +471,8 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
                 delete copy;
               };
 
-              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
-                                          copy_ready);
-#else
-              done(errors::Internal("No GPU device in process"));
-#endif  // GOOGLE_CUDA
+              send_dev_context->CopyDeviceTensorToCPU(
+                  &val, request->rendezvous_key(), src_dev, copy, copy_ready);
             } else {
               grpc::EncodeTensorToByteBuffer(is_dead, val, response);
               done(Status::OK());
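
Note: the sketch below is not part of this commit; it only illustrates why dispatching
through the virtual method helps. After this change, GrpcWorker::GrpcRecvTensorAsync
reaches the sending device through send_args.device_context and calls
DeviceContext::CopyDeviceTensorToCPU, so a non-GPU accelerator can participate in the
RecvTensor path simply by overriding that method on its own DeviceContext. The class
name CustomAcceleratorDeviceContext and the elided transfer call are hypothetical
placeholders, not TensorFlow APIs.

// Hypothetical, minimal DeviceContext for a non-GPU accelerator, assuming the
// TF 1.9-era DeviceContext::CopyDeviceTensorToCPU signature shown in the diff.
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

class CustomAcceleratorDeviceContext : public DeviceContext {
 public:
  void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                             StringPiece tensor_name, Device* device,
                             Tensor* cpu_tensor, StatusCallback done) override {
    // A real implementation would enqueue an asynchronous device-to-host copy
    // of device_tensor's buffer into cpu_tensor using the device's own
    // transfer mechanism (elided here) and invoke `done` when it finishes.
    // Placeholder: signal success immediately.
    done(Status::OK());
  }
};

}  // namespace tensorflow

In GrpcRecvTensorAsync such a context would arrive as send_args.device_context, which
is exactly the pointer the new code invokes the copy on, whereas the old
GPUUtil::CopyGPUTensorToCPU call compiled only under GOOGLE_CUDA.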