tf_cc_test_mkl(
name = "mkl_runtime_tests",
size = "small",
- srcs = ["common_runtime/mkl_cpu_allocator_test.cc"],
+ srcs = [
+ "common_runtime/mkl_cpu_allocator_test.cc",
+ "common_runtime/mkl_threadpool_device_test.cc",
+ ],
linkstatic = 1,
deps = [
":core",
--- /dev/null
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+
+#ifdef _OPENMP
+// With no OMP_NUM_THREADS in the environment, constructing a
+// ThreadPoolDevice should default omp_get_max_threads() to the number of
+// physical cores: schedulable CPUs divided by hyperthreads per core,
+// rounded up.
+TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
+  SessionOptions options;
+  unsetenv("OMP_NUM_THREADS");
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  const int ht = port::NumHyperthreadsPerCore();
+  EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
+
+  delete tp;  // The test owns the device; don't leak it (trips ASAN/LSAN).
+}
+
+// When the user pre-sets OMP_NUM_THREADS, constructing a ThreadPoolDevice
+// must respect that value rather than overriding it with the
+// physical-core default.
+TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
+  SessionOptions options;
+  setenv("OMP_NUM_THREADS", "314", 1);
+
+  ThreadPoolDevice* tp = new ThreadPoolDevice(
+      options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator());
+
+  EXPECT_EQ(omp_get_max_threads(), 314);
+
+  delete tp;                    // The test owns the device; don't leak it.
+  unsetenv("OMP_NUM_THREADS");  // Don't leak the setting into later tests.
+}
+#endif // _OPENMP
+
+} // namespace tensorflow
+
+#endif // INTEL_MKL
#include "tensorflow/core/common_runtime/process_util.h"
#ifdef INTEL_MKL
+#ifdef _OPENMP
#include <omp.h>
-#endif
+#endif // _OPENMP
+#endif // INTEL_MKL
#include <string.h>
#include "tensorflow/core/lib/core/threadpool.h"
// MKL library executes ops in parallel using OMP threads
// Set inter_op conservatively to avoid thread oversubscription that could
// lead to severe perf degradations and OMP resource exhaustion
- const int mkl_intra_op = omp_get_max_threads();
+ int mkl_intra_op = 1;
+#ifdef _OPENMP
+ mkl_intra_op = omp_get_max_threads();
+#endif // _OPENMP
CHECK_GE(mkl_intra_op, 1);
const int32 mkl_inter_op = std::max(
(port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
#else
// Default to using the number of cores available in the process.
return port::NumSchedulableCPUs();
-#endif
+#endif // INTEL_MKL
}
thread::ThreadPool* NewThreadPoolFromSessionOptions(
#include "tensorflow/core/public/session_options.h"
#ifdef INTEL_MKL
+#ifdef _OPENMP
+#include <omp.h>
+#endif
#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#include "tensorflow/core/platform/cpu_info.h"
#endif
namespace tensorflow {
: LocalDevice(options, Device::BuildDeviceAttributes(
name, DEVICE_CPU, memory_limit, locality)),
allocator_(allocator),
- scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {}
+ scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
+#ifdef INTEL_MKL
+#ifdef _OPENMP
+ const char* user_omp_threads = getenv("OMP_NUM_THREADS");
+ if (user_omp_threads == nullptr) {
+ // OMP_NUM_THREADS controls MKL's intra-op parallelization
+ // Default to available physical cores
+ const int mkl_intra_op = port::NumSchedulableCPUs();
+ const int ht = port::NumHyperthreadsPerCore();
+ omp_set_num_threads((mkl_intra_op + ht - 1) / ht);
+ } else {
+ uint64 user_val = 0;
+ if (strings::safe_strtou64(user_omp_threads, &user_val)) {
+ // Superfluous (OpenMP already reads OMP_NUM_THREADS itself), but calling
+ // omp_set_num_threads here forces the OpenMP runtime to load eagerly.
+ omp_set_num_threads(user_val);
+ }
+ }
+#endif // _OPENMP
+#endif // INTEL_MKL
+}
ThreadPoolDevice::~ThreadPoolDevice() {}
#endif
}
+// Queries CPUID leaf 11 (x86 extended topology enumeration) for the number
+// of logical processors (SMT / hyper-threads) per physical core.  Returns 0
+// when the topology cannot be determined (non-x86 build, or leaf 11 not
+// supported by the processor); callers treat 0 as "unknown".
+int CPUIDNumSMT() {
+#ifdef PLATFORM_IS_X86
+ // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration
+ // https://software.intel.com/en-us/articles/intel-sdm (Vol 3A)
+ // Section: Detecting Hardware Multi-threads Support and Topology
+ // Uses CPUID Leaf 11 to enumerate system topology on Intel x86 architectures
+ // Other cases not supported
+ uint32 eax, ebx, ecx, edx;
+ // Check if system supports Leaf 11: leaf 0's EAX reports the highest
+ // supported standard leaf.
+ GETCPUID(eax, ebx, ecx, edx, 0, 0);
+ if (eax >= 11) {
+ // 1) Leaf 11 available? CPUID.(EAX=11, ECX=0):EBX != 0
+ // 2) SMT_Mask_Width = CPUID.(EAX=11, ECX=0):EAX[4:0] if CPUID.(EAX=11,
+ // ECX=0):ECX[15:8] is 1
+ GETCPUID(eax, ebx, ecx, edx, 11, 0);
+ // ECX[15:8] == 1 means this topology level describes SMT threads.
+ if (ebx != 0 && ((ecx & 0xff00) >> 8) == 1) {
+ return 1 << (eax & 0x1f); // 2 ^ SMT_Mask_Width
+ }
+ }
+#endif // PLATFORM_IS_X86
+ // Topology not determined.
+ return 0;
+}
+
} // namespace port
} // namespace tensorflow
// software can change it dynamically.
int NumSchedulableCPUs();
+// Returns an estimate of the number of hyperthreads per physical core
+// on the CPU
+int NumHyperthreadsPerCore();
+
// Mostly ISA related features that we care about
enum CPUFeature {
// Do not change numeric assignments.
// Returns nominal core processor cycles per second of each processor.
double NominalCPUFrequency();
+// Returns the number of hyperthreads (SMT threads) per physical core, or 0
+// if it cannot be determined.
+int CPUIDNumSMT();
+
} // namespace port
} // namespace tensorflow
return kDefaultCores;
}
+// Returns the number of hyperthreads (SMT threads) per physical core,
+// falling back to 1 when CPUID-based detection reports nothing usable.
+int NumHyperthreadsPerCore() {
+  // CPU topology is fixed for the lifetime of the process, so the CPUID
+  // query is performed once and cached.
+  static const int smt_width = tensorflow::port::CPUIDNumSMT();
+  if (smt_width > 0) {
+    return smt_width;
+  }
+  return 1;  // Detection unavailable: assume no SMT.
+}
+
void* AlignedMalloc(size_t size, int minimum_alignment) {
#if defined(__ANDROID__)
return memalign(minimum_alignment, size);