Fix grappler to use CudaGpuId instead of TfGpuId to query device states.
author Guangda Lai <laigd@google.com>
Sat, 10 Feb 2018 06:47:30 +0000 (22:47 -0800)
committer TensorFlower Gardener <gardener@tensorflow.org>
Sat, 10 Feb 2018 06:51:28 +0000 (22:51 -0800)
PiperOrigin-RevId: 185233116

tensorflow/core/grappler/clusters/BUILD
tensorflow/core/grappler/clusters/single_machine.cc
tensorflow/core/grappler/clusters/utils.cc
tensorflow/core/grappler/clusters/utils.h
tensorflow/core/grappler/costs/BUILD
tensorflow/core/grappler/costs/utils.cc

index 5b8ce373bcf87a10875e764ba5cdbec96d58c080..b15a709c5b8c0efcbffc2f9be1e1b250cd736533 100644 (file)
@@ -26,13 +26,12 @@ config_setting(
 tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
-    hdrs = [
-        "utils.h",
-    ],
+    hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ] + select({
@@ -104,6 +103,7 @@ cc_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
index 862ce4ae8883f394fd299914e245a69f1962f564..3e97b31f2cb49353f887ed4a07b34301105308ef 100644 (file)
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/cc/training/queue_runner.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -79,13 +80,17 @@ Status SingleMachine::Provision() {
 
   std::vector<DeviceAttributes> devices;
   TF_RETURN_IF_ERROR(session_->ListDevices(&devices));
-  int gpu_id = 0;
   for (const auto& dev : devices) {
     DeviceProperties attr;
     if (dev.device_type() == "CPU") {
       attr = GetLocalCPUInfo();
     } else if (dev.device_type() == "GPU") {
-      attr = GetLocalGPUInfo(gpu_id++);
+      DeviceNameUtils::ParsedName parsed;
+      if (!DeviceNameUtils::ParseFullName(dev.name(), &parsed)) {
+        return errors::InvalidArgument(
+            strings::StrCat("Not able to parse GPU device name: ", dev.name()));
+      }
+      attr = GetLocalGPUInfo(TfGpuId(parsed.id));
     } else {
       attr.set_type(dev.device_type());
     }
index aacd2ccb72df07ac6b31c9bd5b96deca499038e4..3e7a7a3356aa68ddc04f14db61bb0b250b8412a9 100644 (file)
@@ -27,6 +27,8 @@ limitations under the License.
 #include "include/libxsmm.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
@@ -66,13 +68,14 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }
 
-DeviceProperties GetLocalGPUInfo(int gpu_id) {
+DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id) {
   DeviceProperties device;
   device.set_type("GPU");
 
 #if GOOGLE_CUDA
   cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
+  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+  cudaError_t error = cudaGetDeviceProperties(&properties, cuda_gpu_id.value());
   if (error == cudaSuccess) {
     device.set_vendor("NVidia");
     device.set_model(properties.name);
@@ -94,6 +97,10 @@ DeviceProperties GetLocalGPUInfo(int gpu_id) {
     // double data rate (DDR).
     device.set_bandwidth(properties.memoryBusWidth / 8 *
                          properties.memoryClockRate * 2);
+  } else {
+    LOG(ERROR) << "Failed to get device properties, error code: " << error;
+    device.set_type("UNKNOWN");
+    return device;
   }
 
   (*device.mutable_environment())["architecture"] =
@@ -110,9 +117,9 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
     return GetLocalCPUInfo();
   } else if (device.type == "GPU") {
     if (device.has_id) {
-      return GetLocalGPUInfo(device.id);
+      return GetLocalGPUInfo(TfGpuId(device.id));
     } else {
-      return GetLocalGPUInfo(0);
+      return GetLocalGPUInfo(TfGpuId(0));
     }
   }
   DeviceProperties result;
index 191942040a1fdd276bb50f799ce314389c2cb0fe..4ea7e9839004e6f76a9ab71062073ea216b31de6 100644 (file)
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 #define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -27,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
 
 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id);
 
 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
index 0fe01e9c9e094ebfa7fd1e6200d775ef61775184..5336df1f51dbb5dd5f48857a088ece1b1a04dbb5 100644 (file)
@@ -142,6 +142,7 @@ tf_cuda_library(
         "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
index 602f69f12ea9d24ebd94da73a2a76d1992f3bfb1..ac30090607043a9d5231d7bfc185508eceb090cd 100644 (file)
@@ -26,6 +26,7 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
@@ -203,7 +204,7 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
-      return GetLocalGPUInfo(parsed.id);
+      return GetLocalGPUInfo(TfGpuId(parsed.id));
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }