X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=modules%2Fcore%2Fsrc%2Fconvert.cpp;h=e3d901eafd85882b87a8b42f62b09ff5c097519a;hb=d611684d9e8c8e01d01df7b4f64b46223ef9940b;hp=d6abaa4adbf80dfd5be07984c787ece61721a54a;hpb=d2f1c00719383e9c42a4ff36f99babae910af64b;p=profile%2Fivi%2Fopencv.git diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index d6abaa4..e3d901e 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -50,6 +50,71 @@ namespace cv * split & merge * \****************************************************************************************/ +#if CV_NEON +template struct VSplit2; +template struct VSplit3; +template struct VSplit4; + +#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + } \ + } + +#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2){ \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + } \ + } + +#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2, data_type* dst3){ \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + store_func(dst3, r.val[3]); \ + } \ + } + +SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); +SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 ); +SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); +SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); +SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); + +SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); +SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 ); +SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); +SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); +SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); + +SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); +SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 ); +SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); +SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); +SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); +#endif + template static void split_( const T* src, T** dst, int len, int cn ) { @@ -58,13 +123,34 @@ split_( const T* src, T** dst, int len, int cn ) if( k == 1 ) { T* dst0 = dst[0]; - for( i = j = 0; i < len; i++, j += cn ) - dst0[i] = src[j]; + + if(cn == 1) + { + memcpy(dst0, src, len * sizeof(T)); + } + else + { + for( i = 0, j = 0 ; i < len; i++, j += cn ) + dst0[i] = src[j]; + } } else if( k == 2 ) { T *dst0 = dst[0], *dst1 = dst[1]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; + +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } +#endif + for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; dst1[i] = src[j+1]; @@ -73,7 +159,20 @@ split_( const T* src, T** dst, int len, int cn ) else if( k == 3 ) { T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; + +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } +#endif + for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; dst1[i] = src[j+1]; @@ -83,7 +182,20 @@ split_( const T* src, T** dst, int len, int cn ) else { T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; + +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } +#endif + for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; dst1[i] = src[j+1]; dst2[i] = src[j+2]; dst3[i] = src[j+3]; @@ -101,6 +213,77 @@ split_( const T* src, T** dst, int len, int cn ) } } + +#if CV_NEON +template struct VMerge2; +template struct VMerge3; +template struct VMerge4; + +#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + store_func(dst, r); \ + } \ + } + +#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + store_func(dst, r); \ + } \ + } + +#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, const data_type* src3, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + r.val[3] = load_func(src3); \ + store_func(dst, r); \ + } \ + } + +MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); +MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 ); +MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); +MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16); +MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); +MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32); +MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); + +MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); +MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 ); +MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); +MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16); +MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); +MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32); +MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); + +MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); +MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 ); +MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); +MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16); +MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); +MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32); +MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); +#endif + template static void merge_( const T** src, T* dst, int len, int cn ) { @@ -115,7 +298,19 @@ merge_( const T** src, T* dst, int len, int cn ) else if( k == 2 ) { const T *src0 = src[0], *src1 = src[1]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; dst[j+1] = src1[i]; @@ -124,7 +319,19 @@ merge_( const T** src, T* dst, int len, int cn ) else if( k == 3 ) { const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; dst[j+1] = src1[i]; @@ -134,7 +341,19 @@ merge_( const T** src, T* dst, int len, int cn ) else { const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; - for( i = j = 0; i < len; i++, j += cn ) + i = j = 0; +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; dst[j+1] = src1[i]; dst[j+2] = src2[i]; dst[j+3] = src3[i];