inference-engine/thirdparty/clDNN/src/half.cpp

   1 /*
   2 // Copyright (c) 2017 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18 #include <immintrin.h>
  19 #include <stdint.h>
  20
  21 namespace cldnn {
  22
  23 float half_to_float(uint16_t value) {
  24     static const uint32_t FLOAT16_EXP_SHIFT = (23 - 10);
  25     static const uint32_t FLOAT16_EXP_MASK = 0x7C00;
  26     static const uint32_t FLOAT32_EXP_MASK = 0x7F800000;
  27     static const uint32_t FLOAT16_MANTISSA_MASK = 0x03FF;
  28     static const uint32_t FLOAT16_TO_32_BIAS_DIFF_DENORM =
  29         ((127 - 15 - 10)
  30          << 23);  // The difference is (127-15) but we want to do the calculation in the exp place (bit 23:32)
  31     static const uint32_t FLOAT16_TO_32_BIAS_DIFF = ((127 - 15) << 10);
  32     static const uint32_t FLOAT16_IMPLICIT_1 = (1 << 10);
  33     static const uint32_t FLOAT16_EXP_MIN = (1 << 10);
  34     static const uint32_t FLOAT16_SIGN_MASK = 0x8000;
  35     __m128i a = _mm_unpacklo_epi16(_mm_set1_epi16(value), _mm_setzero_si128());
  36     __m128i exps = _mm_and_si128(_mm_set1_epi32(FLOAT16_EXP_MASK), a);           // Mask the exponents
  37     __m128i mantissa = _mm_and_si128(_mm_set1_epi32(FLOAT16_MANTISSA_MASK), a);  // Mask the mantissa
  38     __m128i signs = _mm_and_si128(_mm_set1_epi32(FLOAT16_SIGN_MASK), a);
  39     signs = _mm_slli_epi32(signs, 16);
  40
  41     __m128i nans = _mm_cmpeq_epi32(exps, _mm_set1_epi32(FLOAT16_EXP_MASK));
  42     nans = _mm_and_si128(nans, _mm_set1_epi32(FLOAT32_EXP_MASK));
  43     nans = _mm_or_si128(nans, signs);
  44
  45     __m128i subnormals = _mm_cmpeq_epi32(exps, _mm_setzero_si128());
  46
  47     int out32;
  48     // e\m| 0 | 1
  49     // ------------
  50     //  0 | 0 | S
  51     // ------------
  52     //  1 | N | N
  53     //
  54     // The expression: (~exp) & mantissa, will evaluate to 0 exactly when the number is non subnormal or it's zero (just
  55     // like in the table) testz Tests for this condition
  56     if (_mm_testz_si128(subnormals, mantissa)) {
  57         __m128i tmp;
  58         exps = _mm_add_epi32(exps, _mm_set1_epi32(FLOAT16_TO_32_BIAS_DIFF));
  59         tmp = _mm_or_si128(exps, mantissa);
  60         tmp = _mm_slli_epi32(tmp, FLOAT16_EXP_SHIFT);
  61         tmp = _mm_blendv_epi8(
  62             tmp,
  63             _mm_setzero_si128(),
  64             subnormals);  // The idea is of course to use blendv_ps, but epi8 will work the same and won't switch stack
  65         tmp = _mm_or_si128(tmp, nans);
  66         out32 = _mm_extract_epi32(tmp, 0);
  67     } else {
  68         __m128i normals = _mm_andnot_si128(subnormals, _mm_set1_epi32(FLOAT16_IMPLICIT_1));  // Mark all normal numbers
  69         mantissa = _mm_or_si128(mantissa, normals);                                          // Apply implicit bit
  70         exps = _mm_max_epi16(
  71             exps,
  72             _mm_set1_epi32(
  73                 FLOAT16_EXP_MIN));  // All subnormals will have 1 in the exponent (needed for correct bias computation)
  74         exps = _mm_slli_epi32(exps, FLOAT16_EXP_SHIFT);
  75         exps = _mm_add_epi32(exps, _mm_set1_epi32(FLOAT16_TO_32_BIAS_DIFF_DENORM));
  76         __m128 tmp;
  77         tmp = _mm_mul_ps(_mm_castsi128_ps(exps), _mm_cvtepi32_ps(mantissa));
  78         tmp = _mm_or_ps(tmp, _mm_castsi128_ps(nans));
  79         out32 = _mm_extract_ps(tmp, 0);
  80     }
  81
  82     float outf32 = *reinterpret_cast<float*>(&out32);
  83     return outf32;
  84 }
  85 uint16_t float_to_half(float value) {
  86 #define TO_M128i(a) (*reinterpret_cast<__m128i*>(&(a)))
  87 #define TO_M128(a) (*const_cast<__m128*>(reinterpret_cast<const __m128*>(&(a))))
  88
  89     static const uint32_t DWORD_SIGNMASK = 0x80000000;
  90     static const uint32_t DWORD_MINFP16 = 0x38800000;
  91     static const uint32_t DWORD_MAXFP16 = 0x477fe000;
  92     static const uint32_t DWORD_FP16_2_POW_10 = (1 << 10);
  93     static const uint32_t DWORD_FP16_EXPBIAS_NO_HALF = 0xc8000000;
  94     static const uint32_t WORD_MAXFP16 = 0x7BFF;
  95
  96     static const __m128i IVec4SignMask = _mm_set1_epi32(DWORD_SIGNMASK);
  97     static const __m128i IVec4MinNormalFp16 = _mm_set1_epi32(DWORD_MINFP16);
  98     static const __m128i IVec4MaxNormalFp16 = _mm_set1_epi32(DWORD_MAXFP16);
  99     static const __m128i IVec4OnePow10 = _mm_set1_epi32(DWORD_FP16_2_POW_10);
 100     static const __m128i IVec4ExpBiasFp16 = _mm_set1_epi32(DWORD_FP16_EXPBIAS_NO_HALF);
 101     static const __m128i IVec4MaxFp16InWords = _mm_set1_epi32(WORD_MAXFP16);
 102
 103     static const __m128 FVec4MaxNormalFp16 = TO_M128(IVec4MaxNormalFp16);
 104     static const __m128 FVec4MinNormalFp16 = TO_M128(IVec4MinNormalFp16);
 105     static const __m128i IVec4InfF32 = _mm_set1_epi32(0x7f800000);  // inf in in hex representation
 106     static const __m128i IVec4InfF16 = _mm_set1_epi32(0x00007c00);
 107
 108     static const __m128 FVec4MaxFp16InWords = TO_M128(IVec4MaxFp16InWords);
 109
 110     __m128 Src = _mm_set1_ps(value);
 111
 112     // Remove the sign bit from the source
 113     __m128 AbsSrc = _mm_andnot_ps(TO_M128(IVec4SignMask), Src);
 114
 115     // Create a mask to identify the DWORDs that are smaller than the minimum normalized fp16 number
 116     __m128 CmpToMinFp16Mask = _mm_cmplt_ps(AbsSrc, FVec4MinNormalFp16);
 117
 118     // Create a mask to identify the DWORDs that are larger than the maximum normalized fp16 number
 119     __m128 CmpToMaxFp16Mask = _mm_cmpgt_ps(AbsSrc, FVec4MaxNormalFp16);
 120     __m128i CmpToInfMask = _mm_cmpeq_epi32(TO_M128i(AbsSrc), IVec4InfF32);
 121     // Create a mask with the minimum normalized fp16 number in the DWORDs that are smaller than it
 122     __m128 MaskOfMinFp16 = _mm_and_ps(CmpToMinFp16Mask, FVec4MinNormalFp16);
 123
 124     __m128i MaskOf2POW10 = _mm_and_si128(TO_M128i(CmpToMinFp16Mask), IVec4OnePow10);
 125     __m128 ResultPS = _mm_add_ps(AbsSrc, MaskOfMinFp16);
 126     __m128i Result = TO_M128i(ResultPS);
 127
 128     // We need to move from a 127 biased domain to a 15 biased domain. This means subtracting 112 from the exponent. We
 129     // will add '-112' to the exponent but since the exponent is shifted 23 bits to the left we need to shift '-112' 23
 130     // bits to the left as well. This gives us 0xC8000000. We are going to shift the mantissa 13 bits to the right
 131     // (moving from 23 bits mantissa to 10).
 132     Result = _mm_add_epi32(Result, IVec4ExpBiasFp16);
 133
 134     // Shift the mantissa to go from 23 bits to 10 bits
 135     Result = _mm_srli_epi32(Result, 13);
 136
 137     Result = _mm_sub_epi16(Result, MaskOf2POW10);
 138
 139     ResultPS = _mm_blendv_ps(TO_M128(Result), FVec4MaxFp16InWords, CmpToMaxFp16Mask);
 140     Result = TO_M128i(ResultPS);
 141     // infinity preserving blending
 142     Result = _mm_blendv_epi8(Result, IVec4InfF16, CmpToInfMask);
 143
 144     __m128i iPackedResult = _mm_packs_epi32(Result, Result);
 145
 146     // iSignMask = mask of the sign bits of the source 4 dwords
 147     __m128i iSignMask = _mm_and_si128(TO_M128i(Src), IVec4SignMask);
 148
 149     // Pack the sign mask to 4 words
 150     __m128i iSignInWords = _mm_packs_epi32(iSignMask, iSignMask);
 151
 152     iPackedResult = _mm_or_si128(iPackedResult, iSignInWords);
 153     return (uint16_t)_mm_extract_epi16(iPackedResult, 0);
 154 }
 155 }  // namespace cldnn