Setting default openmp settings for MKL kernels (#19136)

author Jayaram Bobba <jayaram.bobba@intel.com>

Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)

committer Rasmus Munk Larsen <rmlarsen@google.com>

Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)
author Jayaram Bobba <jayaram.bobba@intel.com>
Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)
committer Rasmus Munk Larsen <rmlarsen@google.com>
Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD

index e536451..bf49495 100644 (file)
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -3289,7 +3289,10 @@ tf_cc_tests_gpu(
  tf_cc_test_mkl(
      name = "mkl_runtime_tests",
      size = "small",
-    srcs = ["common_runtime/mkl_cpu_allocator_test.cc"],
+    srcs = [
+        "common_runtime/mkl_cpu_allocator_test.cc",
+        "common_runtime/mkl_threadpool_device_test.cc",
+    ],
      linkstatic = 1,
      deps = [
          ":core",
diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc

new file mode 100644 (file)

index 0000000..5d583a8
--- /dev/null
+++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+#ifdef _OPENMP
+TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
+  SessionOptions options;
+  unsetenv("OMP_NUM_THREADS");
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  const int ht = port::NumHyperthreadsPerCore();
+  EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
+}
+
+TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
+  SessionOptions options;
+  setenv("OMP_NUM_THREADS", "314", 1);
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  EXPECT_EQ(omp_get_max_threads(), 314);
+}
+#endif  // _OPENMP
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc

index 2191223..a5d31b7 100644 (file)
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -16,8 +16,10 @@ limitations under the License.
  #include "tensorflow/core/common_runtime/process_util.h"
  
  #ifdef INTEL_MKL
+#ifdef _OPENMP
  #include <omp.h>
-#endif
+#endif  // _OPENMP
+#endif  // INTEL_MKL
  #include <string.h>
  
  #include "tensorflow/core/lib/core/threadpool.h"
@@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
    // MKL library executes ops in parallel using OMP threads
    // Set inter_op conservatively to avoid thread oversubscription that could
    // lead to severe perf degradations and OMP resource exhaustion
-  const int mkl_intra_op = omp_get_max_threads();
+  int mkl_intra_op = 1;
+#ifdef _OPENMP
+  mkl_intra_op = omp_get_max_threads();
+#endif  // _OPENMP
    CHECK_GE(mkl_intra_op, 1);
    const int32 mkl_inter_op = std::max(
        (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
@@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
  #else
    // Default to using the number of cores available in the process.
    return port::NumSchedulableCPUs();
-#endif
+#endif  // INTEL_MKL
  }
  
  thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc

index f7a07fe..74a8721 100644 (file)
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -31,7 +31,11 @@ limitations under the License.
  #include "tensorflow/core/public/session_options.h"
  
  #ifdef INTEL_MKL
+#ifdef _OPENMP
+#include <omp.h>
+#endif
  #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#include "tensorflow/core/platform/cpu_info.h"
  #endif
  
  namespace tensorflow {
@@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
      : LocalDevice(options, Device::BuildDeviceAttributes(
                                 name, DEVICE_CPU, memory_limit, locality)),
        allocator_(allocator),
-      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
+#ifdef INTEL_MKL
+#ifdef _OPENMP
+  const char* user_omp_threads = getenv("OMP_NUM_THREADS");
+  if (user_omp_threads == nullptr) {
+    // OMP_NUM_THREADS controls MKL's intra-op parallelization
+    // Default to available physical cores
+    const int mkl_intra_op = port::NumSchedulableCPUs();
+    const int ht = port::NumHyperthreadsPerCore();
+    omp_set_num_threads((mkl_intra_op + ht - 1) / ht);
+  } else {
+    uint64 user_val = 0;
+    if (strings::safe_strtou64(user_omp_threads, &user_val)) {
+      // Superfluous but triggers OpenMP loading
+      omp_set_num_threads(user_val);
+    }
+  }
+#endif  // _OPENMP
+#endif  // INTEL_MKL
+}
  
  ThreadPoolDevice::~ThreadPoolDevice() {}
  
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc

index 99de364..e9da3d8 100644 (file)
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -344,5 +344,28 @@ int CPUModelNum() {
  #endif
  }
  
+int CPUIDNumSMT() {
+#ifdef PLATFORM_IS_X86
+  // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration
+  // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A)
+  // Section: Detecting Hardware Multi-threads Support and Topology
+  // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures
+  // Other cases not supported
+  uint32 eax, ebx, ecx, edx;
+  // Check if system supports Leaf 11
+  GETCPUID(eax, ebx, ecx, edx, 0, 0);
+  if (eax >= 11) {
+    // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0
+    // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11,
+    // ECX=0):ECX[15:8] is 1
+    GETCPUID(eax, ebx, ecx, edx, 11, 0);
+    if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) {
+      return 1 << (eax & 0x1f);  // 2 ^ SMT_Mask_Width
+    }
+  }
+#endif  // PLATFORM_IS_X86
+  return 0;
+}
+
  }  // namespace port
  }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h

index b5be7e8..175c9ae 100644 (file)
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -35,6 +35,10 @@ namespace port {
  // software can change it dynamically.
  int NumSchedulableCPUs();
  
+// Returns an estimate of the number of hyperthreads per physical core
+// on the CPU
+int NumHyperthreadsPerCore();
+
  // Mostly ISA related features that we care about
  enum CPUFeature {
    // Do not change numeric assignments.
@@ -107,6 +111,9 @@ int CPUModelNum();
  // Returns nominal core processor cycles per second of each processor.
  double NominalCPUFrequency();
  
+// Returns num of hyperthreads per physical core (0 if it cannot be determined)
+int CPUIDNumSMT();
+
  }  // namespace port
  }  // namespace tensorflow
  
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc

index 8e31647..708f32b 100644 (file)
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -74,6 +74,11 @@ int NumSchedulableCPUs() {
    return kDefaultCores;
  }
  
+int NumHyperthreadsPerCore() {
+  static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
+  return (ht_per_core > 0) ? ht_per_core : 1;
+}
+
  void* AlignedMalloc(size_t size, int minimum_alignment) {
  #if defined(__ANDROID__)
    return memalign(minimum_alignment, size);
author	Jayaram Bobba <jayaram.bobba@intel.com>
	Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)
committer	Rasmus Munk Larsen <rmlarsen@google.com>
	Tue, 22 May 2018 19:03:24 +0000 (12:03 -0700)
tensorflow/core/BUILD		patch \| blob \| history
tensorflow/core/common_runtime/mkl_threadpool_device_test.cc	[new file with mode: 0644]	patch \| blob
tensorflow/core/common_runtime/process_util.cc		patch \| blob \| history
tensorflow/core/common_runtime/threadpool_device.cc		patch \| blob \| history
tensorflow/core/platform/cpu_info.cc		patch \| blob \| history
tensorflow/core/platform/cpu_info.h		patch \| blob \| history
tensorflow/core/platform/posix/port.cc		patch \| blob \| history