From: Jayaram Bobba Date: Tue, 22 May 2018 19:03:24 +0000 (-0700) Subject: Setting default openmp settings for MKL kernels (#19136) X-Git-Tag: upstream/v1.9.0_rc1~73 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c36266e1ed6bf632408461844fe3e8ef21f32839;p=platform%2Fupstream%2Ftensorflow.git Setting default openmp settings for MKL kernels (#19136) * Change inter op defaults when built with MKL to avoid thread oversubscription * Bump up default mkl inter_op to be less conservative * Added default OMP settings that are expected to give reasonable performance when using MKL kernels * Query CPUID for determining number of hyperthreads per physical core on Intel 64 architectures * Style fixes * Buildifier and clang-format style fixes * Removed use of setenv due to concerns about thread safety --- diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index e536451..bf49495 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3289,7 +3289,10 @@ tf_cc_tests_gpu( tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", - srcs = ["common_runtime/mkl_cpu_allocator_test.cc"], + srcs = [ + "common_runtime/mkl_cpu_allocator_test.cc", + "common_runtime/mkl_threadpool_device_test.cc", + ], linkstatic = 1, deps = [ ":core", diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc new file mode 100644 index 0000000..5d583a8 --- /dev/null +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/common_runtime/threadpool_device.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +#ifdef _OPENMP +TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { + SessionOptions options; + unsetenv("OMP_NUM_THREADS"); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + const int ht = port::NumHyperthreadsPerCore(); + EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); +} + +TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { + SessionOptions options; + setenv("OMP_NUM_THREADS", "314", 1); + + ThreadPoolDevice* tp = new ThreadPoolDevice( + options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); + + EXPECT_EQ(omp_get_max_threads(), 314); +} +#endif // _OPENMP + +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 2191223..a5d31b7 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -16,8 +16,10 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/process_util.h" #ifdef INTEL_MKL +#ifdef _OPENMP #include <omp.h> -#endif +#endif // _OPENMP +#endif // INTEL_MKL #include <string.h> #include "tensorflow/core/lib/core/threadpool.h" @@ -57,7 +59,10 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { // MKL library executes ops in parallel using OMP threads // Set inter_op conservatively to avoid thread oversubscription that could // lead to severe perf degradations and OMP resource exhaustion - const int mkl_intra_op = omp_get_max_threads(); + int mkl_intra_op = 1; +#ifdef _OPENMP + mkl_intra_op = omp_get_max_threads(); +#endif // _OPENMP CHECK_GE(mkl_intra_op, 1); const int32 mkl_inter_op = std::max( (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2); @@ -68,7 +73,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) { #else // Default to using the number of cores available in the process. return port::NumSchedulableCPUs(); -#endif +#endif // INTEL_MKL } thread::ThreadPool* NewThreadPoolFromSessionOptions( diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f7a07fe..74a8721 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -31,7 +31,11 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #ifdef INTEL_MKL +#ifdef _OPENMP +#include <omp.h> +#endif #include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#include "tensorflow/core/platform/cpu_info.h" #endif namespace tensorflow { @@ -43,7 +47,26 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, : LocalDevice(options, Device::BuildDeviceAttributes( name, DEVICE_CPU, memory_limit, locality)), allocator_(allocator), - scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {} + scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) { +#ifdef INTEL_MKL +#ifdef _OPENMP + const char* user_omp_threads = getenv("OMP_NUM_THREADS"); + if (user_omp_threads == nullptr) { + // OMP_NUM_THREADS controls MKL's intra-op parallelization + // Default to available physical cores + const int mkl_intra_op = port::NumSchedulableCPUs(); + const int ht = port::NumHyperthreadsPerCore(); + omp_set_num_threads((mkl_intra_op + ht - 1) / ht); + } else { + uint64 user_val = 0; + if (strings::safe_strtou64(user_omp_threads, &user_val)) { + // Superfluous but triggers OpenMP loading + omp_set_num_threads(user_val); + } + } +#endif // _OPENMP +#endif // INTEL_MKL +} ThreadPoolDevice::~ThreadPoolDevice() {} diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc index 99de364..e9da3d8 100644 --- a/tensorflow/core/platform/cpu_info.cc +++ b/tensorflow/core/platform/cpu_info.cc @@ -344,5 +344,28 @@ int CPUModelNum() { #endif } +int CPUIDNumSMT() { +#ifdef PLATFORM_IS_X86 + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A) + // Section: Detecting Hardware Multi-threads Support and Topology + // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures + // Other cases not supported + uint32 eax, ebx, ecx, edx; + // Check if system supports Leaf 11 + GETCPUID(eax, ebx, ecx, edx, 0, 0); + if (eax >= 11) { + // 1) Leaf 
11 available? CPUID.(EAX=11, ECX=0):EBX != 0 + // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11, + // ECX=0):ECX[15:8] is 1 + GETCPUID(eax, ebx, ecx, edx, 11, 0); + if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) { + return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width + } + } +#endif // PLATFORM_IS_X86 + return 0; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index b5be7e8..175c9ae 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -35,6 +35,10 @@ namespace port { // software can change it dynamically. int NumSchedulableCPUs(); +// Returns an estimate of the number of hyperthreads per physical core +// on the CPU +int NumHyperthreadsPerCore(); + // Mostly ISA related features that we care about enum CPUFeature { // Do not change numeric assignments. @@ -107,6 +111,9 @@ int CPUModelNum(); // Returns nominal core processor cycles per second of each processor. double NominalCPUFrequency(); +// Returns num of hyperthreads per physical core +int CPUIDNumSMT(); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 8e31647..708f32b 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -74,6 +74,11 @@ int NumSchedulableCPUs() { return kDefaultCores; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ? ht_per_core : 1; +} + void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size);