modules/core/src/convert.simd.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html
   4
   5 #include "precomp.hpp"
   6 #include "convert.hpp"
   7
   8 #if !defined(OPENCV_SUPRESS_WARNING_AVX2_WITHOUT_FP16C) && \
   9     (defined(__GNUC__) && defined(__AVX2__) && !defined(__F16C__))
  10 #warning "Non-optimal compiler flags: AVX2 without FP16. Generated code is very slow. Consider adding '-mf16c' compiler option."
  11 #endif
  12
  13 namespace cv {
  14 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
  15
  16 BinaryFunc getConvertFunc(int sdepth, int ddepth);
  17 BinaryFunc get_cvt32f16f();
  18 BinaryFunc get_cvt16f32f();
  19
  20 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
  21
  22 /*namespace hal {
  23
  24 void cvt16f32f( const float16_t* src, float* dst, int len )
  25 {
  26     int j = 0;
  27 #if CV_SIMD
  28     const int VECSZ = v_float32::nlanes;
  29     for( ; j < len; j += VECSZ )
  30     {
  31         if( j > len - VECSZ )
  32         {
  33             if( j == 0 )
  34                 break;
  35             j = len - VECSZ;
  36         }
  37         v_store(dst + j, vx_load_expand(src + j));
  38     }
  39 #endif
  40     for( ; j < len; j++ )
  41         dst[j] = (float)src[j];
  42 }
  43
  44 void cvt32f16f( const float* src, float16_t* dst, int len )
  45 {
  46     int j = 0;
  47 #if CV_SIMD
  48     const int VECSZ = v_float32::nlanes;
  49     for( ; j < len; j += VECSZ )
  50     {
  51         if( j > len - VECSZ )
  52         {
  53             if( j == 0 )
  54                 break;
  55             j = len - VECSZ;
  56         }
  57         v_pack_store(dst + j, vx_load(src + j));
  58     }
  59 #endif
  60     for( ; j < len; j++ )
  61         dst[j] = float16_t(src[j]);
  62 }
  63
  64 /*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
  65 {
    // the loop is simple enough, so we let the compiler vectorize it
  67     for( int i = 0; i < len; i++ )
  68         arr[i] = scaleBiasPairs[i*2 + 1];
  69 }
  70
  71 void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
  72 {
    // the loop is simple enough, so we let the compiler vectorize it
  74     for( int i = 0; i < len; i++ )
  75         arr[i] = scaleBiasPairs[i*2 + 1];
  76 }
  77
  78 }*/
  79
  80 template<typename _Ts, typename _Td, typename _Twvec> static inline void
  81 cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
  82 {
  83     sstep /= sizeof(src[0]);
  84     dstep /= sizeof(dst[0]);
  85
  86     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
  87     {
  88         int j = 0;
  89 #if CV_SIMD
  90         const int VECSZ = _Twvec::nlanes*2;
  91         for( ; j < size.width; j += VECSZ )
  92         {
  93             if( j > size.width - VECSZ )
  94             {
  95                 if( j == 0 || src == (_Ts*)dst )
  96                     break;
  97                 j = size.width - VECSZ;
  98             }
  99             _Twvec v0, v1;
 100             vx_load_pair_as(src + j, v0, v1);
 101             v_store_pair_as(dst + j, v0, v1);
 102         }
 103 #endif
 104         for( ; j < size.width; j++ )
 105             dst[j] = saturate_cast<_Td>(src[j]);
 106     }
 107 }
 108
 109 // in order to reduce the code size, for (16f <-> ...) conversions
 110 // we add a conversion function without loop unrolling
 111 template<typename _Ts, typename _Td, typename _Twvec> static inline void
 112 cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
 113 {
 114     sstep /= sizeof(src[0]);
 115     dstep /= sizeof(dst[0]);
 116
 117     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
 118     {
 119         int j = 0;
 120 #if CV_SIMD
 121         const int VECSZ = _Twvec::nlanes;
 122         for( ; j < size.width; j += VECSZ )
 123         {
 124             if( j > size.width - VECSZ )
 125             {
 126                 if( j == 0 || src == (_Ts*)dst )
 127                     break;
 128                 j = size.width - VECSZ;
 129             }
 130             _Twvec v;
 131             vx_load_as(src + j, v);
 132             v_store_as(dst + j, v);
 133         }
 134         vx_cleanup();
 135 #endif
 136         for( ; j < size.width; j++ )
 137             dst[j] = saturate_cast<_Td>(src[j]);
 138     }
 139 }
 140
 141 static void cvtCopy( const uchar* src, size_t sstep,
 142                      uchar* dst, size_t dstep, Size size, size_t elemsize)
 143 {
 144     size_t len = size.width*elemsize;
 145     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
 146     {
 147         memcpy( dst, src, len );
 148     }
 149 }
 150
 151 #define DEF_CVT_FUNC(suffix, cvtfunc, _Ts, _Td, _Twvec) \
 152 static void cvt##suffix(const uchar* src_, size_t sstep, const uchar*, size_t, \
 153                         uchar* dst_, size_t dstep, Size size, void*) \
 154 { \
 155     CV_INSTRUMENT_REGION(); \
 156     const _Ts* src = (const _Ts*)src_; \
 157     _Td* dst = (_Td*)dst_; \
 158     cvtfunc<_Ts, _Td, _Twvec>(src, sstep, dst, dstep, size); \
 159 }
 160
 161 ////////////////////// 8u -> ... ////////////////////////
 162
 163 DEF_CVT_FUNC(8u8s,  cvt_,  uchar, schar,    v_int16)
 164 DEF_CVT_FUNC(8u16u, cvt_,  uchar, ushort,   v_uint16)
 165 DEF_CVT_FUNC(8u16s, cvt_,  uchar, short,    v_int16)
 166 DEF_CVT_FUNC(8u32s, cvt_,  uchar, int,      v_int32)
 167 DEF_CVT_FUNC(8u32f, cvt_,  uchar, float,    v_float32)
 168 DEF_CVT_FUNC(8u64f, cvt_,  uchar, double,   v_int32)
 169 //DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
 170
 171 ////////////////////// 8s -> ... ////////////////////////
 172
 173 DEF_CVT_FUNC(8s8u,  cvt_,  schar, uchar,    v_int16)
 174 DEF_CVT_FUNC(8s16u, cvt_,  schar, ushort,   v_uint16)
 175 DEF_CVT_FUNC(8s16s, cvt_,  schar, short,    v_int16)
 176 DEF_CVT_FUNC(8s32s, cvt_,  schar, int,      v_int32)
 177 DEF_CVT_FUNC(8s32f, cvt_,  schar, float,    v_float32)
 178 DEF_CVT_FUNC(8s64f, cvt_,  schar, double,   v_int32)
 179 //DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
 180
 181 ////////////////////// 16u -> ... ////////////////////////
 182
 183 DEF_CVT_FUNC(16u8u,  cvt_, ushort, uchar,  v_uint16)
 184 DEF_CVT_FUNC(16u8s,  cvt_, ushort, schar,  v_uint16)
 185 DEF_CVT_FUNC(16u16s, cvt_, ushort, short,  v_int32)
 186 DEF_CVT_FUNC(16u32s, cvt_, ushort, int,    v_int32)
 187 DEF_CVT_FUNC(16u32f, cvt_, ushort, float,  v_float32)
 188 DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
 189 //DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
 190
 191 ////////////////////// 16s -> ... ////////////////////////
 192
 193 DEF_CVT_FUNC(16s8u,  cvt_, short, uchar,  v_int16)
 194 DEF_CVT_FUNC(16s8s,  cvt_, short, schar,  v_int16)
 195 DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
 196 DEF_CVT_FUNC(16s32s, cvt_, short, int,    v_int32)
 197 DEF_CVT_FUNC(16s32f, cvt_, short, float,  v_float32)
 198 DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
 199 //DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
 200
 201 ////////////////////// 32s -> ... ////////////////////////
 202
 203 DEF_CVT_FUNC(32s8u,  cvt_, int, uchar,  v_int32)
 204 DEF_CVT_FUNC(32s8s,  cvt_, int, schar,  v_int32)
 205 DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
 206 DEF_CVT_FUNC(32s16s, cvt_, int, short,  v_int32)
 207 DEF_CVT_FUNC(32s32f, cvt_, int, float,  v_float32)
 208 DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
 209 //DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
 210
 211 ////////////////////// 32f -> ... ////////////////////////
 212
 213 DEF_CVT_FUNC(32f8u,  cvt_, float, uchar,  v_float32)
 214 DEF_CVT_FUNC(32f8s,  cvt_, float, schar,  v_float32)
 215 DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32)
 216 DEF_CVT_FUNC(32f16s, cvt_, float, short,  v_float32)
 217 DEF_CVT_FUNC(32f32s, cvt_, float, int,    v_float32)
 218 DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32)
 219 DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32)
 220
 221 ////////////////////// 64f -> ... ////////////////////////
 222
 223 DEF_CVT_FUNC(64f8u,  cvt_, double, uchar,  v_int32)
 224 DEF_CVT_FUNC(64f8s,  cvt_, double, schar,  v_int32)
 225 DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
 226 DEF_CVT_FUNC(64f16s, cvt_, double, short,  v_int32)
 227 DEF_CVT_FUNC(64f32s, cvt_, double, int,    v_int32)
 228 DEF_CVT_FUNC(64f32f, cvt_, double, float,  v_float32)
 229 //DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
 230
 231 ////////////////////// 16f -> ... ////////////////////////
 232
 233 //DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
 234 //DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
 235 //DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
 236 //DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
 237 //DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
 238 DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float,  v_float32)
 239 //DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
 240
 241 ///////////// "conversion" w/o conversion ///////////////
 242
 243 static void cvt8u(const uchar* src, size_t sstep, const uchar*, size_t, uchar* dst, size_t dstep, Size size, void*)
 244 { CV_INSTRUMENT_REGION(); cvtCopy(src, sstep, dst, dstep, size, 1); }
 245
 246 static void cvt16u(const uchar* src, size_t sstep, const uchar*, size_t, uchar* dst, size_t dstep, Size size, void*)
 247 { CV_INSTRUMENT_REGION(); cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 2); }
 248
 249 static void cvt32s(const uchar* src, size_t sstep, const uchar*, size_t, uchar* dst, size_t dstep, Size size, void*)
 250 { CV_INSTRUMENT_REGION(); cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 4); }
 251
 252 static void cvt64s(const uchar* src, size_t sstep, const uchar*, size_t, uchar* dst, size_t dstep, Size size, void*)
 253 { CV_INSTRUMENT_REGION(); cvtCopy((const uchar*)src, sstep, (uchar*)dst, dstep, size, 8); }
 254
 255
 256 /* [TODO] Recover IPP calls
 257 #if defined(HAVE_IPP)
 258 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
 259 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 260                          dtype* dst, size_t dstep, Size size, double*) \
 261 { \
 262     CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \
 263     cvt_(src, sstep, dst, dstep, size); \
 264 }
 265
 266 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
 267 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 268                          dtype* dst, size_t dstep, Size size, double*) \
 269 { \
 270     CV_IPP_RUN(src && dst, CV_INSTRUMENT_FUN_IPP(ippiConvert_##ippFavor, src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \
 271     cvt_(src, sstep, dst, dstep, size); \
 272 }
 273 #else
 274 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
 275 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 276                          dtype* dst, size_t dstep, Size size, double*) \
 277 { \
 278     cvt_(src, sstep, dst, dstep, size); \
 279 }
 280 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
 281 #endif
 282
 283 #define DEF_CVT_FUNC(suffix, stype, dtype) \
 284 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 285                          dtype* dst, size_t dstep, Size size, double*) \
 286 { \
 287     cvt_(src, sstep, dst, dstep, size); \
 288 }
 289
 290 #define DEF_CPY_FUNC(suffix, stype) \
 291 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 292                          stype* dst, size_t dstep, Size size, double*) \
 293 { \
 294     cpy_(src, sstep, dst, dstep, size); \
 295 }
 296
 297 DEF_CPY_FUNC(8u,     uchar)
 298 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
 299 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
 300 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
 301 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
 302 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
 303 DEF_CVT_FUNC(64f8u,  double, uchar)
 304
 305 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
 306 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
 307 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
 308 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
 309 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
 310 DEF_CVT_FUNC(64f8s,  double, schar)
 311
 312 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
 313 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
 314 DEF_CPY_FUNC(16u,    ushort)
 315 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
 316 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
 317 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
 318 DEF_CVT_FUNC(64f16u, double, ushort)
 319
 320 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
 321 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
 322 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
 323 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
 324 DEF_CVT_FUNC(32f16s, float, short)
 325 DEF_CVT_FUNC(64f16s, double, short)
 326
 327 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
 328 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
 329 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
 330 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
 331 DEF_CPY_FUNC(32s,    int)
 332 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
 333 DEF_CVT_FUNC(64f32s, double, int)
 334
 335 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
 336 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
 337 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
 338 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
 339 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
 340 DEF_CVT_FUNC(64f32f, double, float)
 341
 342 DEF_CVT_FUNC(8u64f,  uchar, double)
 343 DEF_CVT_FUNC(8s64f,  schar, double)
 344 DEF_CVT_FUNC(16u64f, ushort, double)
 345 DEF_CVT_FUNC(16s64f, short, double)
 346 DEF_CVT_FUNC(32s64f, int, double)
 347 DEF_CVT_FUNC(32f64f, float, double)
 348 DEF_CPY_FUNC(64s,    int64)
 349 */
 350
 351 BinaryFunc getConvertFunc(int sdepth, int ddepth)
 352 {
 353     static BinaryFunc cvtTab[][8] =
 354     {
 355         {
 356             (cvt8u), (cvt8s8u), (cvt16u8u),
 357             (cvt16s8u), (cvt32s8u), (cvt32f8u),
 358             (cvt64f8u), 0 //(cvt16f8u)
 359         },
 360         {
 361             (cvt8u8s), cvt8u, (cvt16u8s),
 362             (cvt16s8s), (cvt32s8s), (cvt32f8s),
 363             (cvt64f8s), 0 //(cvt16f8s)
 364         },
 365         {
 366             (cvt8u16u), (cvt8s16u), cvt16u,
 367             (cvt16s16u), (cvt32s16u), (cvt32f16u),
 368             (cvt64f16u), 0 //(cvt16f16u)
 369         },
 370         {
 371             (cvt8u16s), (cvt8s16s), (cvt16u16s),
 372             cvt16u, (cvt32s16s), (cvt32f16s),
 373             (cvt64f16s), 0 //(cvt16f16s)
 374         },
 375         {
 376             (cvt8u32s), (cvt8s32s), (cvt16u32s),
 377             (cvt16s32s), cvt32s, (cvt32f32s),
 378             (cvt64f32s), 0 //(cvt16f32s)
 379         },
 380         {
 381             (cvt8u32f), (cvt8s32f), (cvt16u32f),
 382             (cvt16s32f), (cvt32s32f), cvt32s,
 383             (cvt64f32f), 0 //(cvt16f32f)
 384         },
 385         {
 386             (cvt8u64f), (cvt8s64f), (cvt16u64f),
 387             (cvt16s64f), (cvt32s64f), (cvt32f64f),
 388             (cvt64s), 0 //(cvt16f64f)
 389         },
 390         {
 391             0, 0, 0, 0, 0, 0, 0, 0
 392             //(cvt8u16f), (cvt8s16f), (cvt16u16f), (cvt16s16f),
 393             //(cvt32s16f), (cvt32f16f), (cvt64f16f), (cvt16u)
 394         }
 395     };
 396     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
 397 }
 398
 399 BinaryFunc get_cvt32f16f()
 400 {
 401     return cvt32f16f;
 402 }
 403
 404 BinaryFunc get_cvt16f32f()
 405 {
 406     return cvt16f32f;
 407 }
 408
 409 #endif
 410
 411 CV_CPU_OPTIMIZATION_NAMESPACE_END
 412 } // namespace