From: Tomoaki Teshima Date: Sat, 21 May 2016 12:31:33 +0000 (+0900) Subject: add feature to convert FP32(float) to FP16(half) X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1748^2~9 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b2ad7cd9c018d9acf5e319e3f6a99651c0dca244;p=platform%2Fupstream%2Fopencv.git add feature to convert FP32(float) to FP16(half) * check compiler support * check HW support before executing * add test doing round trip conversion from / to FP32 * treat array correctly if size is not multiple of 4 * add declaration to prevent warning * make it possible to enable fp16 on 32bit ARM * let the conversion possible on non-supported HW, too. * add test using both HW and SW implementation --- diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 0dcf7ed..33dd575 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -146,8 +146,11 @@ if(CMAKE_COMPILER_IS_GNUCXX) elseif(X86 OR X86_64) add_extra_compiler_option(-mno-sse2) endif() + if(ARM) + add_extra_compiler_option("-mfp16-format=ieee") + endif(ARM) if(ENABLE_NEON) - add_extra_compiler_option("-mfpu=neon") + add_extra_compiler_option("-mfpu=neon-fp16") endif() if(ENABLE_VFPV3 AND NOT ENABLE_NEON) add_extra_compiler_option("-mfpu=vfpv3") @@ -167,6 +170,9 @@ if(CMAKE_COMPILER_IS_GNUCXX) add_extra_compiler_option(-mfma) endif() endif() + if((X86 OR X86_64) AND NOT MSVC) + add_extra_compiler_option(-mf16c) + endif((X86 OR X86_64) AND NOT MSVC) # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed. if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index 0d180f5..fa7ab46 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -524,6 +524,17 @@ For example: CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst, double alpha = 1, double beta = 0); +/** @brief Converts an array to half precision floating number. + +convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or +CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing. + +@param src input array. +@param dst output array. +@param useHW if possible use HW SIMD instruction to convert +*/ +CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst, bool useHW = true); + /** @brief Performs a look-up table transform of an array. The function LUT fills the output array with values from the look-up table. Indices of the entries diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index c005914..42e9311 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -112,7 +112,7 @@ #define CV_CPU_SSE4_1 6 #define CV_CPU_SSE4_2 7 #define CV_CPU_POPCNT 8 - +#define CV_CPU_FP16 9 #define CV_CPU_AVX 10 #define CV_CPU_AVX2 11 #define CV_CPU_FMA3 12 @@ -143,7 +143,7 @@ enum CpuFeatures { CPU_SSE4_1 = 6, CPU_SSE4_2 = 7, CPU_POPCNT = 8, - + CPU_FP16 = 9, CPU_AVX = 10, CPU_AVX2 = 11, CPU_FMA3 = 12, @@ -193,6 +193,10 @@ enum CpuFeatures { # endif # define CV_POPCNT 1 # endif +# if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700) +# include +# define CV_FP16 1 +# endif # if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0) // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 @@ -223,6 +227,10 @@ enum CpuFeatures { # define CV_NEON 1 #endif +#if defined __GNUC__ && ((defined (__arm__) && (__ARM_FP & 0x2)) || defined(__aarch64__)) +# define CV_FP16 1 +#endif + #if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ # define CV_VFP 1 #endif @@ -253,6 +261,9 @@ enum CpuFeatures { #ifndef CV_SSE4_2 # define CV_SSE4_2 0 #endif +#ifndef CV_FP16 +# define CV_FP16 0 +#endif #ifndef CV_AVX # define CV_AVX 0 #endif diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index f41bfa1..4ff9830 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -4356,6 +4356,283 @@ struct Cvt_SIMD #endif +#if !(defined (__arm__) || defined (__aarch64__)) +// const numbers for floating points format +const unsigned int kShiftSignificand = 13; +const unsigned int kMaskFp16Significand = 0x3ff; +const unsigned int kBiasFp16Exponent = 15; +const unsigned int kBiasFp32Exponent = 127; + +union fp32Int32 +{ + int i; + float f; + struct _fp32Format + { + unsigned int significand : 23; + unsigned int exponent : 8; + unsigned int sign : 1; + } fmt; +}; +#endif + +union fp16Int16 +{ + short i; +#if defined (__arm__) || defined (__aarch64__) + __fp16 h; +#endif + struct _fp16Format + { + unsigned int significand : 10; + unsigned int exponent : 5; + unsigned int sign : 1; + } fmt; +}; + +#if defined (__arm__) || defined (__aarch64__) +static float convertFp16SW(short fp16) +{ + // Fp16 -> Fp32 + fp16Int16 a; + a.i = fp16; + return (float)a.h; +} +#else +static float convertFp16SW(short fp16) +{ + // Fp16 -> Fp32 + fp16Int16 b; + b.i = fp16; + int exponent = b.fmt.exponent - kBiasFp16Exponent; + int significand = b.fmt.significand; + + fp32Int32 a; + a.i = 0; + a.fmt.sign = b.fmt.sign; // sign bit + if( exponent == 16 ) + { + // Inf or NaN + a.i = a.i | 0x7F800000; + if( significand != 0 ) + { + // NaN +#if defined(__x86_64__) || defined(_M_X64) + // 64bit + a.i = a.i | 0x7FC00000; +#endif + a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand); + } + return a.f; + } + else if ( exponent == -15 ) + { + // subnormal in Fp16 + if( significand == 0 ) + { + // zero + return a.f; + } + else + { + int shift = -1; + while( ( significand & 0x400 ) == 0 ) + { + significand = significand << 1; + shift++; + } + significand = significand & kMaskFp16Significand; + exponent -= shift; + } + } + + a.fmt.exponent = (exponent+kBiasFp32Exponent); + a.fmt.significand = significand << kShiftSignificand; + return a.f; +} +#endif + +#if defined (__arm__) || defined (__aarch64__) +static short convertFp16SW(float fp32) +{ + // Fp32 -> Fp16 + fp16Int16 a; + a.h = (__fp16)fp32; + return a.i; +} +#else +static short convertFp16SW(float fp32) +{ + // Fp32 -> Fp16 + fp32Int32 a; + a.f = fp32; + int exponent = a.fmt.exponent - kBiasFp32Exponent; + int significand = a.fmt.significand; + + fp16Int16 result; + result.i = 0; + if( 0x477ff000 <= ( a.i & 0x7fffffff ) ) + { + // Inf in Fp16 + result.i = result.i | 0x7C00; + if( exponent == 128 && significand != 0 ) + { + // NaN + result.i = (short)(result.i | 0x200 | (significand >> kShiftSignificand)); + } + } + else if ( ( a.i & 0x7fffffff ) <= 0x387fe000 ) + { + // subnormal in Fp16 + int fp16Significand = significand | 0x800000; + int bitShift = (-exponent) - 1; + fp16Significand = fp16Significand >> bitShift; + + // special cases to round up + int threshold = 0x8000 + ( ( fp16Significand & 1 ) ? 0 : 1 ); + if( threshold <= ( significand & 0xffff ) ) + { + fp16Significand++; + } + result.i = (short)fp16Significand; + } + else + { + // usual situation + // exponent + result.fmt.exponent = (exponent + kBiasFp16Exponent); + + // significand; + short fp16Significand = (short)(significand >> kShiftSignificand); + result.fmt.significand = fp16Significand; + + // special cases to round up + short lsb10bitsFp32 = (significand & 0x1fff); + short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 ); + if( threshold <= lsb10bitsFp32 ) + { + result.i++; + } + else if ( fp16Significand == 0x3ff && exponent == -15) + { + result.i++; + } + } + + // sign bit + result.fmt.sign = a.fmt.sign; + return result.i; +} +#endif + +template static void +cvtScaleHalfSW_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + for ( int x = 0 ; x < size.width; x ++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} + +// template for FP16 HW conversion function +template static void +cvtScaleHalfHW_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + + for ( ; x < size.width; x++ ) + { + } + } +} + +template<> void +cvtScaleHalfHW_( const float* src, size_t sstep, short* dst, size_t dstep, Size size) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + + if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 ) + { +#if CV_FP16 + for ( ; x <= size.width - 4; x += 4) + { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) + __m128 v_src = _mm_load_ps(src + x); + + __m128i v_dst = _mm_cvtps_ph(v_src, 0); + + _mm_storel_epi64((__m128i *)(dst + x), v_dst); +#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) + float32x4_t v_src = *(float32x4_t*)(src + x); + + float16x4_t v_dst = vcvt_f16_f32(v_src); + + *(float16x4_t*)(dst + x) = v_dst; +#endif + } +#endif + } + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} + +template<> void +cvtScaleHalfHW_( const short* src, size_t sstep, float* dst, size_t dstep, Size size) +{ + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) + { + int x = 0; + + if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 ) + { +#if CV_FP16 + for ( ; x <= size.width - 4; x += 4) + { +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) + __m128i v_src = _mm_loadl_epi64((__m128i*)(src+x)); + + __m128 v_dst = _mm_cvtph_ps(v_src); + + _mm_store_ps((dst + x), v_dst); +#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) + float16x4_t v_src = *(float16x4_t*)(src + x); + + float32x4_t v_dst = vcvt_f32_f16(v_src); + + *(float32x4_t*)(dst + x) = v_dst; +#endif + } +#endif + } + for ( ; x < size.width; x++ ) + { + dst[x] = convertFp16SW(src[x]); + } + } +} + template static void cvt_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size ) @@ -4443,6 +4720,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \ } +#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype, resource) \ +static void cvtScaleHalf##suffix##resource( const stype* src, size_t sstep, const uchar*, size_t, \ +dtype* dst, size_t dstep, Size size, double*) \ +{ \ + cvtScaleHalf##resource##_(src, sstep, dst, dstep, size); \ +} + #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ dtype* dst, size_t dstep, Size size, double* scale) \ @@ -4499,6 +4783,11 @@ DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float) DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float) DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float) +DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short, SW) +DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float, SW) +DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short, HW) +DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float, HW) + DEF_CVT_SCALE_FUNC(8u, uchar, uchar, float) DEF_CVT_SCALE_FUNC(8s8u, schar, uchar, float) DEF_CVT_SCALE_FUNC(16u8u, ushort, uchar, float) @@ -4620,6 +4909,30 @@ static BinaryFunc getCvtScaleAbsFunc(int depth) return cvtScaleAbsTab[depth]; } +BinaryFunc getConvertFuncFp16(int ddepth, bool useHW) +{ + static BinaryFunc cvtTabHW[] = + { + 0, 0, 0, + (BinaryFunc)(cvtScaleHalf32f16fHW), 0, (BinaryFunc)(cvtScaleHalf16f32fHW), + 0, 0, + }; + static BinaryFunc cvtTabSW[] = + { + 0, 0, 0, + (BinaryFunc)(cvtScaleHalf32f16fSW), 0, (BinaryFunc)(cvtScaleHalf16f32fSW), + 0, 0, + }; + if( useHW == true) + { + return cvtTabHW[CV_MAT_DEPTH(ddepth)]; + } + else + { + return cvtTabSW[CV_MAT_DEPTH(ddepth)]; + } +} + BinaryFunc getConvertFunc(int sdepth, int ddepth) { static BinaryFunc cvtTab[][8] = @@ -4804,6 +5117,52 @@ void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, doubl } } +void cv::convertFp16( InputArray _src, OutputArray _dst, bool useHW ) +{ + if ( checkHardwareSupport(CV_CPU_FP16) == false) + { + useHW = false; + } + + Mat src = _src.getMat(); + int ddepth = 0; + + switch( src.depth() ) + { + case CV_32F: + ddepth = CV_16S; + break; + case CV_16S: + ddepth = CV_32F; + break; + default: + return; + } + + int type = CV_MAKETYPE(ddepth, src.channels()); + _dst.create( src.dims, src.size, type ); + Mat dst = _dst.getMat(); + BinaryFunc func = getConvertFuncFp16(ddepth, useHW); + int cn = src.channels(); + CV_Assert( func != 0 ); + + if( src.dims <= 2 ) + { + Size sz = getContinuousSize(src, dst, cn); + func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0); + } + else + { + const Mat* arrays[] = {&src, &dst, 0}; + uchar* ptrs[2]; + NAryMatIterator it(arrays, ptrs); + Size sz((int)(it.size*cn), 1); + + for( size_t i = 0; i < it.nplanes; i++, ++it ) + func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, 0); + } +} + void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const { bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON; diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index f699ede..cece96c 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -135,6 +135,7 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height, void*); +BinaryFunc getConvertFuncFp16(int ddepth, bool useHW); BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz); diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 768280e..a3858c1 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -291,6 +291,7 @@ struct HWFeatures f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + f.have[CV_CPU_FP16] = (cpuid_data[2] & (1<<29)) != 0; // make the second call to the cpuid command in order to get // information about extended features like AVX2 @@ -338,7 +339,8 @@ struct HWFeatures #if defined ANDROID || defined __linux__ #ifdef __aarch64__ f.have[CV_CPU_NEON] = true; - #else + f.have[CV_CPU_FP16] = true; + #elif defined __arm__ int cpufile = open("/proc/self/auxv", O_RDONLY); if (cpufile >= 0) @@ -351,6 +353,7 @@ struct HWFeatures if (auxv.a_type == AT_HWCAP) { f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; + f.have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0; break; } } @@ -358,9 +361,14 @@ struct HWFeatures close(cpufile); } #endif - #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) + #elif (defined __clang__ || defined __APPLE__) + #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) f.have[CV_CPU_NEON] = true; #endif + #if (defined __ARM_FP && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__)) + f.have[CV_CPU_FP16] = true; + #endif + #endif return f; } diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index ace7950..58974a8 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -737,6 +737,60 @@ struct ConvertScaleOp : public BaseElemWiseOp int ddepth; }; +struct ConvertScaleFp16Op : public BaseElemWiseOp +{ + ConvertScaleFp16Op() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)), nextRange(0) { } + void op(const vector& src, Mat& dst, const Mat&) + { + convertFp16(src[0], dst, true); + } + void refop(const vector& src, Mat& dst, const Mat&) + { + convertFp16(src[0], dst, false); + } + int getRandomType(RNG&) + { + // 0: FP32 -> FP16 + // 1: FP16 -> FP32 + int srctype = (nextRange & 1) == 0 ? CV_32F : CV_16S; + return srctype; + } + void getValueRange(int, double& minval, double& maxval) + { + // 0: FP32 -> FP16 + // 1: FP16 -> FP32 + if( (nextRange & 1) == 0 ) + { + // largest integer number that fp16 can express + maxval = 65504.f; + minval = -maxval; + } + else + { + // 0: positive number range + // 1: negative number range + if( (nextRange & 2) == 0 ) + { + minval = 0; // 0x0000 +0 + maxval = 31744; // 0x7C00 +Inf + } + else + { + minval = -32768; // 0x8000 -0 + maxval = -1024; // 0xFC00 -Inf + } + } + } + double getMaxErr(int) + { + return 0.5f; + } + void generateScalars(int, RNG& rng) + { + nextRange = rng.next(); + } + int nextRange; +}; struct ConvertScaleAbsOp : public BaseElemWiseOp { @@ -1371,6 +1425,7 @@ INSTANTIATE_TEST_CASE_P(Core_Copy, ElemWiseTest, ::testing::Values(ElemWiseOpPtr INSTANTIATE_TEST_CASE_P(Core_Set, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetOp))); INSTANTIATE_TEST_CASE_P(Core_SetZero, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetZeroOp))); INSTANTIATE_TEST_CASE_P(Core_ConvertScale, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleOp))); +INSTANTIATE_TEST_CASE_P(Core_ConvertScaleFp16, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleFp16Op))); INSTANTIATE_TEST_CASE_P(Core_ConvertScaleAbs, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleAbsOp))); INSTANTIATE_TEST_CASE_P(Core_Add, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::AddOp))); diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index ca7664c..a8f1460 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -3064,6 +3064,9 @@ void printVersionInfo(bool useStdOut) #if CV_NEON if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon"; #endif +#if CV_FP16 + if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16"; +#endif cpu_features.erase(0, 1); // erase initial space