1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include "precision_utils.h"
7 #include <details/ie_exception.hpp>
9 #include "inference_engine.hpp"
11 namespace InferenceEngine {
12 namespace PrecisionUtils {
14 INFERENCE_ENGINE_API_CPP(void) f16tof32Arrays(float *dst,
19 const ie_fp16 *_src = reinterpret_cast<const ie_fp16 *>(src);
21 for (size_t i = 0; i < nelem; i++) {
22 dst[i] = PrecisionUtils::f16tof32(_src[i]) * scale + bias;
26 INFERENCE_ENGINE_API_CPP(void) f32tof16Arrays(short *dst,
31 for (size_t i = 0; i < nelem; i++) {
32 dst[i] = PrecisionUtils::f32tof16(src[i] * scale + bias);
36 // Function to convert F32 into F16
37 // F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM.
38 // F16: exp_bias:15 SEEEEEMM MMMMMMMM
39 #define EXP_MASK_F32 0x7F800000U
40 #define EXP_MASK_F16 0x7C00U
42 // small helper function to represent uint32_t value as float32
43 inline float asfloat(uint32_t v) {
44 // Both type-punning casts and unions are UB per C++ spec
45 // But compilers usually only break code with casts
54 // Function to convert F16 into F32
55 INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
56 // this is storage for output result
57 uint32_t u = static_cast<uint32_t>(x);
59 // get sign in 32bit format
60 uint32_t s = ((u & 0x8000) << 16);
62 // check for NAN and INF
63 if ((u & EXP_MASK_F16) == EXP_MASK_F16) {
67 // check if it is NaN and raise the 10th bit to align with the intrinsic
75 } else if ((u & EXP_MASK_F16) == 0) { // check for zero and denormals.
76 uint16_t h_sig = (u & 0x03ffu);
82 uint16_t h_exp = (u & EXP_MASK_F16);
84 while ((h_sig & 0x0400u) == 0) {
88 uint32_t f_exp = (static_cast<uint32_t>(127 - 15 - h_exp)) << 23;
89 uint32_t f_sig = (static_cast<uint32_t>(h_sig & 0x03ffu)) << 13;
90 u = s + f_exp + f_sig;
96 // shift mantissa and exp from f16 to f32 position
99 // new bias for exp (f16 bias is 15 and f32 bias is 127)
100 u += ((127 - 15) << 23);
106 // finally represent the result as float and return
110 // This function converts f32 to f16 with rounding to the nearest value to minimize error
111 // the denormal values are converted to 0.
112 INFERENCE_ENGINE_API_CPP(ie_fp16) f32tof16(float x) {
113 // create minimal positive normal f16 value in f32 format
114 // exp:-14,mantissa:0 -> 2^-14 * 1.0
115 static float min16 = asfloat((127 - 14) << 23);
117 // create maximal positive normal f16 value in f32 and f16 formats
118 // exp:15,mantissa:11111 -> 2^15 * 1.(11111)
119 static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
120 static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;
122 // define and declare a variable for the intermediate and output result
123 // the union is used to simplify representation changing
130 // get sign in 16bit format
131 uint32_t s = (v.u >> 16) & 0x8000; // sign 16: 00000000 00000000 10000000 00000000
134 v.u &= 0x7FFFFFFF; // abs mask: 01111111 11111111 11111111 11111111
137 if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
138 if (v.u & 0x007FFFFF) {
139 return s | (v.u >> (23 - 10)) | 0x0200; // return NAN f16
141 return s | (v.u >> (23 - 10)); // return INF f16
145 // to make f32 round to nearest f16
146 // create halfULP for f16 and add it to origin value
147 float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
150 // if the input value does not fit into a normalized f16 then return 0
151 // denormals are not covered by this code and just converted to 0
152 if (v.f < min16 * 0.5F) {
156 // if the input value is between min16/2 and min16 then return min16
158 return s | (1 << 10);
161 // if the input value is greater than the maximal allowed value for f16
162 // then return this maximal value
167 // change exp bias from 127 to 15
168 v.u -= ((127 - 15) << 23);
176 } // namespace PrecisionUtils
177 } // namespace InferenceEngine