Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / mkl-dnn / src / cpu / xbyak / xbyak_util.h
index 713c68d..08f0a30 100644 (file)
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2018 Intel Corporation
+* Copyright 2016-2019 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 */
 #include "xbyak.h"
 
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+       #define XBYAK_INTEL_CPU_SPECIFIC
+#endif
+
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
 #ifdef _MSC_VER
        #if (_MSC_VER < 1400) && defined(XBYAK32)
                static inline __declspec(naked) void __cpuid(int[4], int)
                #endif
        #endif
 #endif
+#endif
 
 namespace Xbyak { namespace util {
 
+typedef enum {
+   SmtLevel = 1,
+   CoreLevel = 2
+} IntelCpuTopologyLevel;
+
 /**
        CPU detection class
 */
 class Cpu {
        uint64 type_;
+       //system topology
+       bool x2APIC_supported_;
+       static const size_t maxTopologyLevels = 2;
+       unsigned int numCores_[maxTopologyLevels];
+
+       static const unsigned int maxNumberCacheLevels = 10;
+       unsigned int dataCacheSize_[maxNumberCacheLevels];
+       unsigned int coresSharignDataCache_[maxNumberCacheLevels];
+       unsigned int dataCacheLevels_;
+
        unsigned int get32bitAsBE(const char *x) const
        {
                return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
@@ -110,7 +131,7 @@ class Cpu {
        }
        void setFamily()
        {
-               unsigned int data[4];
+               unsigned int data[4] = {};
                getCpuid(1, data);
                stepping = data[0] & mask(4);
                model = (data[0] >> 4) & mask(4);
@@ -133,6 +154,42 @@ class Cpu {
        {
                return (val >> base) & ((1u << (end - base)) - 1);
        }
+       void setNumCores()
+       {
+               if ((type_ & tINTEL) == 0) return;
+
+               unsigned int data[4] = {};
+
+                /* CAUTION: These numbers are configuration as shipped by Intel. */
+               getCpuidEx(0x0, 0, data);
+               if (data[0] >= 0xB) {
+                        /*
+                               if leaf 11 exists(x2APIC is supported),
+                               we use it to get the number of smt cores and cores on socket
+
+                               leaf 0xB can be zeroed-out by a hypervisor
+                       */
+                       x2APIC_supported_ = true;
+                       for (unsigned int i = 0; i < maxTopologyLevels; i++) {
+                               getCpuidEx(0xB, i, data);
+                               IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15);
+                               if (level == SmtLevel || level == CoreLevel) {
+                                       numCores_[level - 1] = extractBit(data[1], 0, 15);
+                               }
+                       }
+                       if (numCores_[SmtLevel - 1] != 0) {
+                               numCores_[CoreLevel - 1] /= numCores_[SmtLevel - 1];
+                       }
+               } else {
+                       /*
+                               Failed to deremine num of cores without x2APIC support.
+                               TODO: USE initial APIC ID to determine ncores.
+                       */
+                       numCores_[SmtLevel - 1] = 0;
+                       numCores_[CoreLevel - 1] = 0;
+               }
+
+       }
        void setCacheHierarchy()
        {
                if ((type_ & tINTEL) == 0) return;
@@ -141,21 +198,12 @@ class Cpu {
 //             const unsigned int INSTRUCTION_CACHE = 2;
                const unsigned int UNIFIED_CACHE = 3;
                unsigned int smt_width = 0;
-               unsigned int n_cores = 0;
-               unsigned int data[4];
+               unsigned int logical_cores = 0;
+               unsigned int data[4] = {};
 
-               /*
-                       if leaf 11 exists, we use it to get the number of smt cores and cores on socket
-                       If x2APIC is supported, these are the only correct numbers.
-
-                       leaf 0xB can be zeroed-out by a hypervisor
-               */
-               getCpuidEx(0x0, 0, data);
-               if (data[0] >= 0xB) {
-                       getCpuidEx(0xB, 0, data); // CPUID for SMT Level
-                       smt_width = data[1] & 0x7FFF;
-                       getCpuidEx(0xB, 1, data); // CPUID for CORE Level
-                       n_cores = data[1] & 0x7FFF;
+               if (x2APIC_supported_) {
+                       smt_width = numCores_[0];
+                       logical_cores = numCores_[1];
                }
 
                /*
@@ -163,28 +211,29 @@ class Cpu {
                        the first level of data cache is not shared (which is the
                        case for every existing architecture) and use this to
                        determine the SMT width for arch not supporting leaf 11.
-                       when leaf 4 reports a number of core less than n_cores
+                       when leaf 4 reports a number of core less than numCores_
                        on socket reported by leaf 11, then it is a correct number
                        of cores not an upperbound.
                */
-               for (int i = 0; data_cache_levels < maxNumberCacheLevels; i++) {
+               for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
                        getCpuidEx(0x4, i, data);
                        unsigned int cacheType = extractBit(data[0], 0, 4);
                        if (cacheType == NO_CACHE) break;
                        if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
-                               unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
-                               if (n_cores != 0) // true only if leaf 0xB is supported and valid
-                                       nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
-                               assert(nb_logical_cores != 0);
-                               data_cache_size[data_cache_levels] =
+                               unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
+                               if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
+                                       actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
+                               }
+                               assert(actual_logical_cores != 0);
+                               dataCacheSize_[dataCacheLevels_] =
                                        (extractBit(data[1], 22, 31) + 1)
                                        * (extractBit(data[1], 12, 21) + 1)
                                        * (extractBit(data[1], 0, 11) + 1)
                                        * (data[2] + 1);
-                               if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
+                               if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
                                assert(smt_width != 0);
-                               cores_sharing_data_cache[data_cache_levels] = nb_logical_cores / smt_width;
-                               data_cache_levels++;
+                               coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u);
+                               dataCacheLevels_++;
                        }
                }
        }
@@ -198,22 +247,22 @@ public:
        int displayFamily; // family + extFamily
        int displayModel; // model + extModel
 
-       // may I move these members into private?
-       static const unsigned int maxNumberCacheLevels = 10;
-       unsigned int data_cache_size[maxNumberCacheLevels];
-       unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
-       unsigned int data_cache_levels;
+       unsigned int getNumCores(IntelCpuTopologyLevel level) {
+               if (level != SmtLevel && level != CoreLevel) throw Error(ERR_BAD_PARAMETER);
+               if (!x2APIC_supported_) throw Error(ERR_X2APIC_IS_NOT_SUPPORTED);
+               return numCores_[level - 1];
+       }
 
-       unsigned int getDataCacheLevels() const { return data_cache_levels; }
+       unsigned int getDataCacheLevels() const { return dataCacheLevels_; }
        unsigned int getCoresSharingDataCache(unsigned int i) const
        {
-               if (i >= data_cache_levels) throw  Error(ERR_BAD_PARAMETER);
-               return cores_sharing_data_cache[i];
+               if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER);
+               return coresSharignDataCache_[i];
        }
        unsigned int getDataCacheSize(unsigned int i) const
        {
-               if (i >= data_cache_levels) throw  Error(ERR_BAD_PARAMETER);
-               return data_cache_size[i];
+               if (i >= dataCacheLevels_) throw  Error(ERR_BAD_PARAMETER);
+               return dataCacheSize_[i];
        }
 
        /*
@@ -221,30 +270,45 @@ public:
        */
        static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
        {
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+       #ifdef _MSC_VER
                __cpuid(reinterpret_cast<int*>(data), eaxIn);
-#else
+       #else
                __cpuid(eaxIn, data[0], data[1], data[2], data[3]);
+       #endif
+#else
+               (void)eaxIn;
+               (void)data;
 #endif
        }
        static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
        {
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+       #ifdef _MSC_VER
                __cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
-#else
+       #else
                __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
+       #endif
+#else
+               (void)eaxIn;
+               (void)ecxIn;
+               (void)data;
 #endif
        }
        static inline uint64 getXfeature()
        {
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+       #ifdef _MSC_VER
                return _xgetbv(0);
-#else
+       #else
                unsigned int eax, edx;
                // xgetvb is not support on gcc 4.2
 //             __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
                __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
                return ((uint64)edx << 32) | eax;
+       #endif
+#else
+               return 0;
 #endif
        }
        typedef uint64 Type;
@@ -315,9 +379,13 @@ public:
 
        Cpu()
                : type_(NONE)
-               , data_cache_levels(0)
+               , x2APIC_supported_(false)
+               , numCores_()
+               , dataCacheSize_()
+               , coresSharignDataCache_()
+               , dataCacheLevels_(0)
        {
-               unsigned int data[4];
+               unsigned int data[4] = {};
                const unsigned int& EAX = data[0];
                const unsigned int& EBX = data[1];
                const unsigned int& ECX = data[2];
@@ -407,6 +475,7 @@ public:
                        if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
                }
                setFamily();
+               setNumCores();
                setCacheHierarchy();
        }
        void putFamily() const
@@ -425,12 +494,17 @@ class Clock {
 public:
        static inline uint64 getRdtsc()
        {
-#ifdef _MSC_VER
+#ifdef XBYAK_INTEL_CPU_SPECIFIC
+       #ifdef _MSC_VER
                return __rdtsc();
-#else
+       #else
                unsigned int eax, edx;
                __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
                return ((uint64)edx << 32) | eax;
+       #endif
+#else
+               // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
+               return 0;
 #endif
        }
        Clock()
@@ -460,7 +534,7 @@ const int UseRCX = 1 << 6;
 const int UseRDX = 1 << 7;
 
 class Pack {
-       static const size_t maxTblNum = 10;
+       static const size_t maxTblNum = 15;
        const Xbyak::Reg64 *tbl_[maxTblNum];
        size_t n_;
 public:
@@ -520,7 +594,7 @@ public:
        const Xbyak::Reg64& operator[](size_t n) const
        {
                if (n >= n_) {
-                       fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
+                       fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
                        throw Error(ERR_BAD_PARAMETER);
                }
                return *tbl_[n];
@@ -562,6 +636,7 @@ class StackFrame {
        static const int rcxPos = 3;
        static const int rdxPos = 2;
 #endif
+       static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
        Xbyak::CodeGenerator *code_;
        int pNum_;
        int tNum_;
@@ -571,7 +646,7 @@ class StackFrame {
        int P_;
        bool makeEpilog_;
        Xbyak::Reg64 pTbl_[4];
-       Xbyak::Reg64 tTbl_[10];
+       Xbyak::Reg64 tTbl_[maxRegNum];
        Pack p_;
        Pack t_;
        StackFrame(const StackFrame&);
@@ -583,7 +658,7 @@ public:
                make stack frame
                @param sf [in] this
                @param pNum [in] num of function parameter(0 <= pNum <= 4)
-               @param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX)
+               @param tNum [in] num of temporary register(0 <= tNum, with UseRCX, UseRDX) #{pNum + tNum [+rcx] + [rdx]} <= 14
                @param stackSizeByte [in] local stack size
                @param makeEpilog [in] automatically call close() if true
 
@@ -610,27 +685,17 @@ public:
                using namespace Xbyak;
                if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
                const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
-               if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
+               if (tNum_ < 0 || allRegNum > maxRegNum) throw Error(ERR_BAD_TNUM);
                const Reg64& _rsp = code->rsp;
-               const AddressFrame& _ptr = code->ptr;
                saveNum_ = (std::max)(0, allRegNum - noSaveNum);
                const int *tbl = getOrderTbl() + noSaveNum;
-               P_ = saveNum_ + (stackSizeByte + 7) / 8;
-               if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
-               P_ *= 8;
-               if (P_ > 0) code->sub(_rsp, P_);
-#ifdef XBYAK64_WIN
-               for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
-                       code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
-               }
-               for (int i = 4; i < saveNum_; i++) {
-                       code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
-               }
-#else
                for (int i = 0; i < saveNum_; i++) {
-                       code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
+                       code->push(Reg64(tbl[i]));
                }
-#endif
+               P_ = (stackSizeByte + 7) / 8;
+               if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++; // (rsp % 16) == 8, then increment P_ for 16 byte alignment
+               P_ *= 8;
+               if (P_ > 0) code->sub(_rsp, P_);
                int pos = 0;
                for (int i = 0; i < pNum; i++) {
                        pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
@@ -651,21 +716,11 @@ public:
        {
                using namespace Xbyak;
                const Reg64& _rsp = code_->rsp;
-               const AddressFrame& _ptr = code_->ptr;
                const int *tbl = getOrderTbl() + noSaveNum;
-#ifdef XBYAK64_WIN
-               for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
-                       code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
-               }
-               for (int i = 4; i < saveNum_; i++) {
-                       code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
-               }
-#else
+               if (P_ > 0) code_->add(_rsp, P_);
                for (int i = 0; i < saveNum_; i++) {
-                       code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
+                       code_->pop(Reg64(tbl[saveNum_ - 1 - i]));
                }
-#endif
-               if (P_ > 0) code_->add(_rsp, P_);
 
                if (callRet) code_->ret();
        }
@@ -677,9 +732,6 @@ public:
                } catch (std::exception& e) {
                        printf("ERR:StackFrame %s\n", e.what());
                        exit(1);
-               } catch (...) {
-                       printf("ERR:StackFrame otherwise\n");
-                       exit(1);
                }
        }
 private:
@@ -698,7 +750,7 @@ private:
        }
        int getRegIdx(int& pos) const
        {
-               assert(pos < 14);
+               assert(pos < maxRegNum);
                using namespace Xbyak;
                const int *tbl = getOrderTbl();
                int r = tbl[pos++];