2 // Copyright (c) 2017 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include <immintrin.h>
// Converts one 16-bit half-precision bit pattern (`value`: sign in bit 15, exponent in bits 10..14,
// mantissa in bits 0..9, per the masks below) into a 32-bit float using SSE integer/float intrinsics.
// Only lane 0 of each vector is ultimately extracted; the other lanes hold copies of the same value.
// NOTE(review): this listing carries embedded line numbers and skips several of them, so the
// declarations of `tmp`/`out32`, some intrinsic operands, the separator between the two branches and
// the function's tail are not visible here — confirm every hedged remark against the complete file.
23 float half_to_float(uint16_t value) {
// Left shift that moves a half-layout exponent/mantissa (mantissa width 10) into the float32 layout
// (mantissa width 23).
24 static const uint32_t FLOAT16_EXP_SHIFT = (23 - 10);
// Exponent field of the half layout (bits 10..14).
25 static const uint32_t FLOAT16_EXP_MASK = 0x7C00;
// Exponent field of the float32 layout (bits 23..30).
26 static const uint32_t FLOAT32_EXP_MASK = 0x7F800000;
// Mantissa field of the half layout (bits 0..9).
27 static const uint32_t FLOAT16_MANTISSA_MASK = 0x03FF;
// Bias correction used by the multiply-based path; it is added after the exponent has already been
// shifted into the float32 exponent position. NOTE(review): the left operand of the shift sits on a
// line that is not shown in this listing.
28 static const uint32_t FLOAT16_TO_32_BIAS_DIFF_DENORM =
30 << 23); // The difference is (127-15) but we want to do the calculation in the exp place (bit 23:32)
// Exponent bias difference (127 - 15) expressed at the half exponent position (bit 10).
31 static const uint32_t FLOAT16_TO_32_BIAS_DIFF = ((127 - 15) << 10);
// Bit 10: the position directly above the 10-bit mantissa, used as the implicit leading one.
32 static const uint32_t FLOAT16_IMPLICIT_1 = (1 << 10);
// Exponent field value 1 in the half layout.
33 static const uint32_t FLOAT16_EXP_MIN = (1 << 10);
// Sign bit of the half layout.
34 static const uint32_t FLOAT16_SIGN_MASK = 0x8000;
// Broadcast `value` to 16-bit lanes and interleave with zeros, so each 32-bit lane holds the
// zero-extended half bit pattern.
35 __m128i a = _mm_unpacklo_epi16(_mm_set1_epi16(value), _mm_setzero_si128());
36 __m128i exps = _mm_and_si128(_mm_set1_epi32(FLOAT16_EXP_MASK), a); // Mask the exponents
37 __m128i mantissa = _mm_and_si128(_mm_set1_epi32(FLOAT16_MANTISSA_MASK), a); // Mask the mantissa
// Isolate the sign (bit 15) and move it to the float32 sign position (bit 31).
38 __m128i signs = _mm_and_si128(_mm_set1_epi32(FLOAT16_SIGN_MASK), a);
39 signs = _mm_slli_epi32(signs, 16);
// Lanes whose half exponent is all ones (Inf/NaN encodings) receive the all-ones float32 exponent.
// Despite its name, `nans` also always carries the sign bit, because `signs` is OR-ed in
// unconditionally; it is merged into the final result of both branches.
41 __m128i nans = _mm_cmpeq_epi32(exps, _mm_set1_epi32(FLOAT16_EXP_MASK));
42 nans = _mm_and_si128(nans, _mm_set1_epi32(FLOAT32_EXP_MASK));
43 nans = _mm_or_si128(nans, signs);
// All-ones mask for lanes whose exponent field is zero (i.e. zero or subnormal inputs).
45 __m128i subnormals = _mm_cmpeq_epi32(exps, _mm_setzero_si128());
54 // The expression: (~exp) & mantissa, will evaluate to 0 exactly when the number is non subnormal or it's zero (just
55 // like in the table) testz Tests for this condition
// _mm_testz_si128 returns 1 when (subnormals & mantissa) is all zeros, i.e. no lane combines a zero
// exponent with a non-zero mantissa. This branch is therefore the pure integer path for inputs that
// are not true subnormals.
56 if (_mm_testz_si128(subnormals, mantissa)) {
// Re-bias the exponent while it is still in the half position, merge it with the mantissa, then
// shift both into the float32 layout.
58 exps = _mm_add_epi32(exps, _mm_set1_epi32(FLOAT16_TO_32_BIAS_DIFF));
59 tmp = _mm_or_si128(exps, mantissa);
60 tmp = _mm_slli_epi32(tmp, FLOAT16_EXP_SHIFT);
// Per-lane select controlled by the `subnormals` mask. NOTE(review): the two value operands of the
// blend are on lines not shown here; presumably zero-exponent lanes are replaced with zero — confirm.
61 tmp = _mm_blendv_epi8(
64 subnormals); // The idea is of course to use blendv_ps, but epi8 will work the same and won't switch stack
// Merge in the sign bit and, for Inf/NaN inputs, the all-ones float32 exponent.
65 tmp = _mm_or_si128(tmp, nans);
// Take the 32-bit result from lane 0.
66 out32 = _mm_extract_epi32(tmp, 0);
// NOTE(review): the line separating the two branches is not visible in this listing; the statements
// below appear to form the alternative path taken for true subnormal inputs — confirm.
// Set the implicit-one bit (bit 10) only in lanes whose exponent is non-zero.
68 __m128i normals = _mm_andnot_si128(subnormals, _mm_set1_epi32(FLOAT16_IMPLICIT_1)); // Mark all normal numbers
69 mantissa = _mm_or_si128(mantissa, normals); // Apply implicit bit
// Tail of a statement whose beginning is on lines not shown in this listing.
73 FLOAT16_EXP_MIN)); // All subnormals will have 1 in the exponent (needed for correct bias computation)
// Move the exponent into the float32 exponent position and apply the bias correction there, which
// yields the bit pattern of a float with an empty mantissa field (a pure scale factor).
74 exps = _mm_slli_epi32(exps, FLOAT16_EXP_SHIFT);
75 exps = _mm_add_epi32(exps, _mm_set1_epi32(FLOAT16_TO_32_BIAS_DIFF_DENORM));
// Multiply that scale factor by the integer mantissa converted to float, letting the FPU normalise
// the result. NOTE(review): `tmp` is used as a __m128 here; its declaration is not visible.
77 tmp = _mm_mul_ps(_mm_castsi128_ps(exps), _mm_cvtepi32_ps(mantissa));
// Merge in the sign bit (and the Inf/NaN exponent bits, if any).
78 tmp = _mm_or_ps(tmp, _mm_castsi128_ps(nans));
// Take the raw 32 bits of lane 0.
79 out32 = _mm_extract_ps(tmp, 0);
// Reinterpret the assembled 32-bit pattern as a float.
// NOTE(review): this is type punning through reinterpret_cast; copying the bytes with memcpy would be
// the strict-aliasing-safe form.
82 float outf32 = *reinterpret_cast<float*>(&out32);
// Converts a 32-bit float (`value`) into a 16-bit half-precision bit pattern using SSE intrinsics.
// Visible behaviour: the magnitude is re-biased and its mantissa shortened by a right shift (the
// dropped low bits are discarded by the shift), magnitudes above the largest finite half are replaced
// with 0x7BFF, infinities map to 0x7C00, and the sign is re-attached at bit 15.
// All four vector lanes hold the same value; only 16-bit word 0 is returned.
// NOTE(review): this listing carries embedded line numbers and skips several of them; the end of the
// function is not visible here.
85 uint16_t float_to_half(float value) {
// Reinterpret an lvalue in place as a __m128i / __m128 (pure bit reinterpretation, no conversion).
86 #define TO_M128i(a) (*reinterpret_cast<__m128i*>(&(a)))
87 #define TO_M128(a) (*const_cast<__m128*>(reinterpret_cast<const __m128*>(&(a))))
// float32 sign bit.
89 static const uint32_t DWORD_SIGNMASK = 0x80000000;
// float32 bit pattern of 2^-14, the smallest normalized half value.
90 static const uint32_t DWORD_MINFP16 = 0x38800000;
// float32 bit pattern of 65504, the largest finite half value.
91 static const uint32_t DWORD_MAXFP16 = 0x477fe000;
// Bit 10: exponent field value 1 in the half layout.
92 static const uint32_t DWORD_FP16_2_POW_10 = (1 << 10);
// -112 shifted left by 23 (modulo 2^32): moves a float32 exponent from bias 127 to bias 15.
93 static const uint32_t DWORD_FP16_EXPBIAS_NO_HALF = 0xc8000000;
// Half bit pattern of the largest finite value.
94 static const uint32_t WORD_MAXFP16 = 0x7BFF;
// Vector broadcasts of the scalar constants above (function-local statics).
96 static const __m128i IVec4SignMask = _mm_set1_epi32(DWORD_SIGNMASK);
97 static const __m128i IVec4MinNormalFp16 = _mm_set1_epi32(DWORD_MINFP16);
98 static const __m128i IVec4MaxNormalFp16 = _mm_set1_epi32(DWORD_MAXFP16);
99 static const __m128i IVec4OnePow10 = _mm_set1_epi32(DWORD_FP16_2_POW_10);
100 static const __m128i IVec4ExpBiasFp16 = _mm_set1_epi32(DWORD_FP16_EXPBIAS_NO_HALF);
101 static const __m128i IVec4MaxFp16InWords = _mm_set1_epi32(WORD_MAXFP16);
// The same bit patterns viewed as float vectors, so they can be used in float compares/adds.
103 static const __m128 FVec4MaxNormalFp16 = TO_M128(IVec4MaxNormalFp16);
104 static const __m128 FVec4MinNormalFp16 = TO_M128(IVec4MinNormalFp16);
105 static const __m128i IVec4InfF32 = _mm_set1_epi32(0x7f800000); // inf in hex representation
// Half bit pattern of infinity, one per 32-bit lane.
106 static const __m128i IVec4InfF16 = _mm_set1_epi32(0x00007c00);
// 0x7BFF viewed as a float vector; it is only used as a bitwise operand of _mm_blendv_ps below.
108 static const __m128 FVec4MaxFp16InWords = TO_M128(IVec4MaxFp16InWords);
// Broadcast the input to all four lanes.
110 __m128 Src = _mm_set1_ps(value);
112 // Remove the sign bit from the source
113 __m128 AbsSrc = _mm_andnot_ps(TO_M128(IVec4SignMask), Src);
115 // Create a mask to identify the DWORDs that are smaller than the minimum normalized fp16 number
116 __m128 CmpToMinFp16Mask = _mm_cmplt_ps(AbsSrc, FVec4MinNormalFp16);
118 // Create a mask to identify the DWORDs that are larger than the maximum normalized fp16 number
119 __m128 CmpToMaxFp16Mask = _mm_cmpgt_ps(AbsSrc, FVec4MaxNormalFp16);
// Integer compare of the sign-stripped bits against the float32 infinity pattern (matches +/-inf).
120 __m128i CmpToInfMask = _mm_cmpeq_epi32(TO_M128i(AbsSrc), IVec4InfF32);
121 // Create a mask with the minimum normalized fp16 number in the DWORDs that are smaller than it
122 __m128 MaskOfMinFp16 = _mm_and_ps(CmpToMinFp16Mask, FVec4MinNormalFp16);
// For the same small lanes, prepare 1 << 10; it is subtracted again after the shift below.
124 __m128i MaskOf2POW10 = _mm_and_si128(TO_M128i(CmpToMinFp16Mask), IVec4OnePow10);
// Small lanes get 2^-14 added in floating point, which pins their exponent to that of 2^-14 and lets
// the adder align the mantissa; other lanes add 0. The extra exponent value this introduces is what
// the later subtraction of MaskOf2POW10 removes, leaving a zero exponent field (subnormal half).
125 __m128 ResultPS = _mm_add_ps(AbsSrc, MaskOfMinFp16);
126 __m128i Result = TO_M128i(ResultPS);
128 // We need to move from a 127 biased domain to a 15 biased domain. This means subtracting 112 from the exponent. We
129 // will add '-112' to the exponent but since the exponent is shifted 23 bits to the left we need to shift '-112' 23
130 // bits to the left as well. This gives us 0xC8000000. We are going to shift the mantissa 13 bits to the right
131 // (moving from 23 bits mantissa to 10).
132 Result = _mm_add_epi32(Result, IVec4ExpBiasFp16);
134 // Shift the mantissa to go from 23 bits to 10 bits
135 Result = _mm_srli_epi32(Result, 13);
// 16-bit lane subtraction: removes the 1 << 10 (exponent value 1) from the low word of small lanes.
137 Result = _mm_sub_epi16(Result, MaskOf2POW10);
// Lanes whose magnitude exceeded the largest finite half are replaced with 0x7BFF. Infinite inputs
// also satisfy that compare, so they are overwritten here and corrected by the next blend.
139 ResultPS = _mm_blendv_ps(TO_M128(Result), FVec4MaxFp16InWords, CmpToMaxFp16Mask);
140 Result = TO_M128i(ResultPS);
141 // infinity preserving blending
142 Result = _mm_blendv_epi8(Result, IVec4InfF16, CmpToInfMask);
// Narrow the 32-bit lanes to 16-bit words (signed saturating pack).
144 __m128i iPackedResult = _mm_packs_epi32(Result, Result);
146 // iSignMask = mask of the sign bits of the source 4 dwords
147 __m128i iSignMask = _mm_and_si128(TO_M128i(Src), IVec4SignMask);
149 // Pack the sign mask to 4 words
// With signed saturation, 0x80000000 packs to 0x8000 and 0 stays 0, i.e. the sign lands in bit 15.
150 __m128i iSignInWords = _mm_packs_epi32(iSignMask, iSignMask);
// Attach the sign and return 16-bit word 0.
152 iPackedResult = _mm_or_si128(iPackedResult, iSignInWords);
153 return (uint16_t)_mm_extract_epi16(iPackedResult, 0);