inference-engine/src/inference_engine/precision_utils.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
#include "precision_utils.h"
#include <stdint.h>
#include <cstring>
#include <details/ie_exception.hpp>
#include <ie_blob.h>
#include "inference_engine.hpp"
  10
  11 namespace InferenceEngine {
  12 namespace PrecisionUtils {
  13
  14 INFERENCE_ENGINE_API_CPP(void) f16tof32Arrays(float *dst,
  15                                               const short *src,
  16                                               size_t nelem,
  17                                               float scale,
  18                                               float bias) {
  19     const ie_fp16 *_src = reinterpret_cast<const ie_fp16 *>(src);
  20
  21     for (size_t i = 0; i < nelem; i++) {
  22         dst[i] = PrecisionUtils::f16tof32(_src[i]) * scale + bias;
  23     }
  24 }
  25
  26 INFERENCE_ENGINE_API_CPP(void) f32tof16Arrays(short *dst,
  27                                               const float *src,
  28                                               size_t nelem,
  29                                               float scale,
  30                                               float bias) {
  31     for (size_t i = 0; i < nelem; i++) {
  32         dst[i] = PrecisionUtils::f32tof16(src[i] * scale + bias);
  33     }
  34 }
  35
// Bit layouts of the two floating-point formats converted in this file
// (S = sign bit, E = exponent bit, M = mantissa bit):
// F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM.
// F16: exp_bias:15  SEEEEEMM MMMMMMMM
// Masks selecting the exponent field of each format.
#define EXP_MASK_F32 0x7F800000U
#define EXP_MASK_F16     0x7C00U
  41
  42 // small helper function to represent uint32_t value as float32
  43 inline float asfloat(uint32_t v) {
  44     // Both type-punning casts and unions are UB per C++ spec
  45     // But compilers usually only break code with casts
  46     union {
  47         float f;
  48         uint32_t i;
  49     };
  50     i = v;
  51     return f;
  52 }
  53
  54 // Function to convert F32 into F16
  55 INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
  56     // this is storage for output result
  57     uint32_t u = static_cast<uint32_t>(x);
  58
  59     // get sign in 32bit format
  60     uint32_t s = ((u & 0x8000) << 16);
  61
  62     // check for NAN and INF
  63     if ((u & EXP_MASK_F16) == EXP_MASK_F16) {
  64         // keep mantissa only
  65         u &= 0x03FF;
  66
  67         // check if it is NAN and raise 10 bit to be align with intrin
  68         if (u) {
  69             u |= 0x0200;
  70         }
  71
  72         u <<= (23 - 10);
  73         u |= EXP_MASK_F32;
  74         u |= s;
  75     } else if ((u & EXP_MASK_F16) == 0) {  // check for zero and denormals.
  76         uint16_t h_sig = (u & 0x03ffu);
  77         if (h_sig == 0) {
  78             /* Signed zero */
  79             u = s;
  80         } else {
  81             /* Subnormal */
  82             uint16_t h_exp = (u & EXP_MASK_F16);
  83             h_sig <<= 1;
  84             while ((h_sig & 0x0400u) == 0) {
  85                 h_sig <<= 1;
  86                 h_exp++;
  87             }
  88             uint32_t f_exp = (static_cast<uint32_t>(127 - 15 - h_exp)) << 23;
  89             uint32_t f_sig = (static_cast<uint32_t>(h_sig & 0x03ffu)) << 13;
  90             u = s + f_exp + f_sig;
  91         }
  92     } else {
  93         // abs
  94         u = (u & 0x7FFF);
  95
  96         // shift mantissa and exp from f16 to f32 position
  97         u <<= (23 - 10);
  98
  99         // new bias for exp (f16 bias is 15 and f32 bias is 127)
 100         u += ((127 - 15) << 23);
 101
 102         // add sign
 103         u |= s;
 104     }
 105
 106     // finaly represent result as float and return
 107     return asfloat(u);
 108 }
 109
 110 // This function convert f32 to f16 with rounding to nearest value to minimize error
 111 // the denormal values are converted to 0.
 112 INFERENCE_ENGINE_API_CPP(ie_fp16) f32tof16(float x) {
 113     // create minimal positive normal f16 value in f32 format
 114     // exp:-14,mantissa:0 -> 2^-14 * 1.0
 115     static float min16 = asfloat((127 - 14) << 23);
 116
 117     // create maximal positive normal f16 value in f32 and f16 formats
 118     // exp:15,mantissa:11111 -> 2^15 * 1.(11111)
 119     static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
 120     static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;
 121
 122     // define and declare variable for intermidiate and output result
 123     // the union is used to simplify representation changing
 124     union {
 125         float f;
 126         uint32_t u;
 127     } v;
 128     v.f = x;
 129
 130     // get sign in 16bit format
 131     uint32_t s = (v.u >> 16) & 0x8000;  // sign 16:  00000000 00000000 10000000 00000000
 132
 133     // make it abs
 134     v.u &= 0x7FFFFFFF;  // abs mask: 01111111 11111111 11111111 11111111
 135
 136     // check NAN and INF
 137     if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
 138         if (v.u & 0x007FFFFF) {
 139             return s | (v.u >> (23 - 10)) | 0x0200;  // return NAN f16
 140         } else {
 141             return s | (v.u >> (23 - 10));  // return INF f16
 142         }
 143     }
 144
 145     // to make f32 round to nearest f16
 146     // create halfULP for f16 and add it to origin value
 147     float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
 148     v.f += halfULP;
 149
 150     // if input value is not fit normalized f16 then return 0
 151     // denormals are not covered by this code and just converted to 0
 152     if (v.f < min16 * 0.5F) {
 153         return s;
 154     }
 155
 156     // if input value between min16/2 and min16 then return min16
 157     if (v.f < min16) {
 158         return s | (1 << 10);
 159     }
 160
 161     // if input value more than maximal allowed value for f16
 162     // then return this maximal value
 163     if (v.f >= max16) {
 164         return max16f16 | s;
 165     }
 166
 167     // change exp bias from 127 to 15
 168     v.u -= ((127 - 15) << 23);
 169
 170     // round to f16
 171     v.u >>= (23 - 10);
 172
 173     return v.u | s;
 174 }
 175
 176 }  // namespace PrecisionUtils
 177 }  // namespace InferenceEngine
 178