// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018-2019 Intel Corporation

// NB: allow including this *.hpp several times!
// #pragma once -- don't: this file is NOT once!

#if !defined(GAPI_STANDALONE)

#include "gfluidimgproc_func.hpp"

#include "opencv2/gapi/own/saturate.hpp"

#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

#include <climits>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <type_traits>

#ifdef __GNUC__
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wstrict-overflow"
#endif

using cv::gapi::own::saturate;

namespace cv {
namespace gapi {
namespace fluid {

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------

void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
                       float coef_r, float coef_g, float coef_b);

//--------------------------------------
//
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
//
//--------------------------------------

void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);

void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);

//-------------------------
//
// Fluid kernels: sepFilter
//
//-------------------------

#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                      \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
                           const float kx[], const float ky[], int border,   \
                           float scale, float delta,                         \
                           float *buf[], int y, int y0);

RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)

#undef RUN_SEPFILTER3X3_IMPL

//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------

#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                      \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
                           const float kernel[], float scale, float delta);

RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)

#undef RUN_FILTER2D_3X3_IMPL

//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------

#define RUN_MORPHOLOGY3X3_IMPL(T)                                         \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan,  \
                            const uchar k[], MorphShape k_type,           \
                            Morphology morphology);

RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

#define RUN_MEDBLUR3X3_IMPL(T) \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);

RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)

#undef RUN_MEDBLUR3X3_IMPL

//----------------------------------------------------------------------

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
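
// Generic helper: loads a v_float32's worth of SRC elements (uchar, ushort,
// short, or float) and widens/converts them to 32-bit float lanes. The
// std::is_same checks are resolved at compile time, so each instantiation
// reduces to a single load-and-convert path.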
#if CV_SIMD
template<typename SRC>
static inline v_float32 vx_load_f32(const SRC* ptr)
{
    if (std::is_same<SRC,uchar>::value)
    {
        v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
        return v_cvt_f32(v_reinterpret_as_s32(tmp));
    }

    if (std::is_same<SRC,ushort>::value)
    {
        v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
        return v_cvt_f32(v_reinterpret_as_s32(tmp));
    }

    if (std::is_same<SRC,short>::value)
    {
        v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
        return v_cvt_f32(tmp);
    }

    if (std::is_same<SRC,float>::value)
    {
        v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
        return tmp;
    }

    CV_Error(cv::Error::StsBadArg, "unsupported type");
}
#endif

//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------

void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
                       float coef_r, float coef_g, float coef_b)
{
    // assume:
    // - coefficients are less than 1
    // - and their sum equals 1

    constexpr int unity = 1 << 16;  // Q0.0.16 inside ushort
    ushort rc = static_cast<ushort>(coef_r * unity + 0.5f);
    ushort gc = static_cast<ushort>(coef_g * unity + 0.5f);
    ushort bc = static_cast<ushort>(coef_b * unity + 0.5f);

    GAPI_Assert(rc + gc + bc <= unity);
    GAPI_Assert(rc + gc + bc >= USHRT_MAX);
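
    // Worked example (illustrative, assuming BT.601-style weights; the real
    // coefficients arrive as arguments): coef_r=0.299, coef_g=0.587,
    // coef_b=0.114 quantize to rc=19595, gc=38470, bc=7471, and their sum
    // 65536 lands in [USHRT_MAX, unity], satisfying both assertions above.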

#if CV_SIMD
    constexpr int nlanes = v_uint8::nlanes;
    if (width >= nlanes)
    {
        for (int w=0; w < width; )
        {
            // process main part of pixels row
            for ( ; w <= width - nlanes; w += nlanes)
            {
                v_uint8 r, g, b;
                v_load_deinterleave(&in[3*w], r, g, b);

                v_uint16 r0, r1, g0, g1, b0, b1;
                v_expand(r, r0, r1);
                v_expand(g, g0, g1);
                v_expand(b, b0, b1);

                v_uint16 y0, y1;
                static const ushort half = 1 << 7;  // Q0.8.8
                y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) +
                      v_mul_hi(g0 << 8, vx_setall_u16(gc)) +
                      v_mul_hi(b0 << 8, vx_setall_u16(bc)) +
                                vx_setall_u16(half)) >> 8;
                y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) +
                      v_mul_hi(g1 << 8, vx_setall_u16(gc)) +
                      v_mul_hi(b1 << 8, vx_setall_u16(bc)) +
                                vx_setall_u16(half)) >> 8;

                v_uint8 y = v_pack(y0, y1);
                v_store(&out[w], y);
            }

            // process tail (if any)
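            // (re-done with one overlapping full vector: stepping back to
            // width - nlanes recomputes a few pixels but avoids a scalar loop)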
            if (w < width)
            {
                GAPI_DbgAssert(width - nlanes >= 0);
                w = width - nlanes;
            }
        }
        return;
    }
#endif

    for (int w=0; w < width; w++)
    {
        uchar r = in[3*w    ];
        uchar g = in[3*w + 1];
        uchar b = in[3*w + 2];

        static const int half = 1 << 15;  // Q0.0.16
        ushort y = (r*rc + g*gc + b*bc + half) >> 16;
        out[w] = static_cast<uchar>(y);
    }
}

//--------------------------------------
//
// Fluid kernels: RGB-to-YUV, YUV-to-RGB
//
//--------------------------------------

void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
{
    ushort c0 = static_cast<ushort>(coef[0]*(1 << 16) + 0.5f);  // Q0.0.16 un-signed
    ushort c1 = static_cast<ushort>(coef[1]*(1 << 16) + 0.5f);
    ushort c2 = static_cast<ushort>(coef[2]*(1 << 16) + 0.5f);
    short  c3 = static_cast<short> (coef[3]*(1 << 12) + 0.5f);  // Q1.0.12 signed
    short  c4 = static_cast<short> (coef[4]*(1 << 12) + 0.5f);

    int w = 0;

#if CV_SIMD
    static const int nlanes = v_uint8::nlanes;
    for ( ; w <= width - nlanes; w += nlanes)
    {
        v_uint8 r, g, b;
        v_load_deinterleave(&in[3*w], r, g, b);

        v_uint16 _r0, _r1, _g0, _g1, _b0, _b1;
        v_expand(r, _r0, _r1);
        v_expand(g, _g0, _g1);
        v_expand(b, _b0, _b1);

        _r0 = _r0 << 7;  // Q0.9.7 un-signed
        _r1 = _r1 << 7;
        _g0 = _g0 << 7;
        _g1 = _g1 << 7;
        _b0 = _b0 << 7;
        _b1 = _b1 << 7;

        v_uint16 _y0, _y1;
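        // v_mul_hi(a, b) keeps the high 16 bits of the 32-bit product,
        // i.e. computes (a*b) >> 16; with b a Q0.0.16 coefficient this
        // multiplies the Q0.9.7 samples by a fraction without leaving
        // the 16-bit lanes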
        _y0 = v_mul_hi(vx_setall_u16(c0), _r0)  // Q0.9.7
            + v_mul_hi(vx_setall_u16(c1), _g0)
            + v_mul_hi(vx_setall_u16(c2), _b0);
        _y1 = v_mul_hi(vx_setall_u16(c0), _r1)
            + v_mul_hi(vx_setall_u16(c1), _g1)
            + v_mul_hi(vx_setall_u16(c2), _b1);

        v_int16 r0, r1, b0, b1, y0, y1;
        r0 = v_reinterpret_as_s16(_r0);  // Q1.8.7 signed
        r1 = v_reinterpret_as_s16(_r1);
        b0 = v_reinterpret_as_s16(_b0);
        b1 = v_reinterpret_as_s16(_b1);
        y0 = v_reinterpret_as_s16(_y0);
        y1 = v_reinterpret_as_s16(_y1);

        v_int16 u0, u1, v0, v1;
        u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0);  // Q1.12.3
        u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1);
        v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0);
        v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1);

        v_uint8 y, u, v;
        y = v_pack  ((_y0 + vx_setall_u16(1 << 6)) >> 7,
                     (_y1 + vx_setall_u16(1 << 6)) >> 7);
        u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3,  // 257 << 2 = 128.5 * (1 << 3)
                     (u1 + vx_setall_s16(257 << 2)) >> 3);
        v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3,
                     (v1 + vx_setall_s16(257 << 2)) >> 3);

        v_store_interleave(&out[3*w], y, u, v);
    }
#endif

    for ( ; w < width; w++)
    {
        short r = in[3*w    ] << 7;  // Q1.8.7 signed
        short g = in[3*w + 1] << 7;
        short b = in[3*w + 2] << 7;
        short y = (c0*r + c1*g + c2*b) >> 16;  // Q1.8.7
        short u =  c3*(b - y) >> 16;           // Q1.12.3
        short v =  c4*(r - y) >> 16;
        out[3*w    ] = static_cast<uchar>((y + (1 << 6)) >> 7);
        out[3*w + 1] = saturate<uchar>((u + (128 << 3) + (1 << 2)) >> 3);
        out[3*w + 2] = saturate<uchar>((v + (128 << 3) + (1 << 2)) >> 3);
    }
}

void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
{
    short c0 = static_cast<short>(coef[0] * (1 << 12) + 0.5f);  // Q1.3.12
    short c1 = static_cast<short>(coef[1] * (1 << 12) + 0.5f);
    short c2 = static_cast<short>(coef[2] * (1 << 12) + 0.5f);
    short c3 = static_cast<short>(coef[3] * (1 << 12) + 0.5f);

    int w = 0;

#if CV_SIMD
    static const int nlanes = v_uint8::nlanes;
    for ( ; w <= width - nlanes; w += nlanes)
    {
        v_uint8 y, u, v;
        v_load_deinterleave(&in[3*w], y, u, v);

        v_uint16 _y0, _y1, _u0, _u1, _v0, _v1;
        v_expand(y, _y0, _y1);
        v_expand(u, _u0, _u1);
        v_expand(v, _v0, _v1);

        v_int16 y0, y1, u0, u1, v0, v1;
        y0 = v_reinterpret_as_s16(_y0);
        y1 = v_reinterpret_as_s16(_y1);
        u0 = v_reinterpret_as_s16(_u0);
        u1 = v_reinterpret_as_s16(_u1);
        v0 = v_reinterpret_as_s16(_v0);
        v1 = v_reinterpret_as_s16(_v1);

        y0 =  y0 << 3;  // Q1.12.3
        y1 =  y1 << 3;
        u0 = (u0 - vx_setall_s16(128)) << 7;  // Q1.8.7
        u1 = (u1 - vx_setall_s16(128)) << 7;
        v0 = (v0 - vx_setall_s16(128)) << 7;
        v1 = (v1 - vx_setall_s16(128)) << 7;

        v_int16 r0, r1, g0, g1, b0, b1;
        r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
        r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
        g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
                + v_mul_hi(vx_setall_s16(c2), v0);
        g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
                + v_mul_hi(vx_setall_s16(c2), v1);
        b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
        b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);

        v_uint8 r, g, b;
        r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
                     (r1 + vx_setall_s16(1 << 2)) >> 3);
        g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
                     (g1 + vx_setall_s16(1 << 2)) >> 3);
        b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
                     (b1 + vx_setall_s16(1 << 2)) >> 3);

        v_store_interleave(&out[3*w], r, g, b);
    }
#endif

    for ( ; w < width; w++)
    {
        short y =  in[3*w    ]        << 3;  // Q1.12.3
        short u = (in[3*w + 1] - 128) << 7;  // Q1.8.7
        short v = (in[3*w + 2] - 128) << 7;
        short r = y + ( c0*v         >> 16); // Q1.12.3
        short g = y + ((c1*u + c2*v) >> 16);
        short b = y + ((c3*u       ) >> 16);
        out[3*w    ] = saturate<uchar>((r + (1 << 2)) >> 3);
        out[3*w + 1] = saturate<uchar>((g + (1 << 2)) >> 3);
        out[3*w + 2] = saturate<uchar>((b + (1 << 2)) >> 3);
    }
}

//-------------------------
//
// Fluid kernels: sepFilter
//
//-------------------------

#if CV_SIMD
// this variant, which does not use buf[], appears 15% faster than the
// reference any-to-float code below
template<bool noscale, typename SRC>
static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta)
{
    const int length = width * chan;
    const int shift = border * chan;

    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];

    for (int l=0; l < length; )
    {
        static const int nlanes = v_float32::nlanes;

        // main part of output row
        for ( ; l <= length - nlanes; l += nlanes)
        {
            auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])
            {
                v_float32 t0 = vx_load_f32(&i[l - shift]);
                v_float32 t1 = vx_load_f32(&i[l        ]);
                v_float32 t2 = vx_load_f32(&i[l + shift]);
                v_float32 t = t0 * vx_setall_f32(kx0);
                          t = v_fma(t1, vx_setall_f32(kx1), t);
                          t = v_fma(t2, vx_setall_f32(kx2), t);
                return t;
            };

            v_float32 s0 = xsum(in[0]);  // previous row
            v_float32 s1 = xsum(in[1]);  // this row
            v_float32 s2 = xsum(in[2]);  // next row
            v_float32 s = s0 * vx_setall_f32(ky0);
                      s = v_fma(s1, vx_setall_f32(ky1), s);
                      s = v_fma(s2, vx_setall_f32(ky2), s);

            if (!noscale)
            {
                s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
            }

            v_store(&out[l], s);
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}

// this variant with manually vectored rounding to short/ushort appears
// 10-40x faster than the reference code below
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta,
                                       float *buf[], int y, int y0)
{
    int r[3];
    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
    r[1] = (y - y0 + 1) % 3;  //            this
    r[2] = (y - y0 + 2) % 3;  //            next row
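    // (buf[] acts as a ring buffer of three horizontally-convolved rows;
    // the modulo-3 indices map previous/this/next onto it, so rows already
    // computed for line y-1 are reused when processing line y)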

    const int length = width * chan;
    const int shift = border * chan;

    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];

    // horizontal pass

    int k0 = (y == y0)? 0: 2;

    for (int k = k0; k < 3; k++)
    {
        //                  previous , this , next pixel
        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};

        // rely on compiler vectoring
        for (int l=0; l < length; l++)
        {
            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
        }
    }

    // vertical pass

    const int r0=r[0], r1=r[1], r2=r[2];

    for (int l=0; l < length;)
    {
        constexpr int nlanes = v_int16::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
                      sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
                      sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);

            v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
                      sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
                      sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);

            if (!noscale)
            {
                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
            }

            v_int32 isum0 = v_round(sum0),
                    isum1 = v_round(sum1);

            if (std::is_same<DST, short>::value)
            {
                v_int16 res = v_pack(isum0, isum1);
                v_store(reinterpret_cast<short*>(&out[l]), res);
            }
            else
            {
                v_uint16 res = v_pack_u(isum0, isum1);
                v_store(reinterpret_cast<ushort*>(&out[l]), res);
            }
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}

// this code with manually vectored rounding to uchar is 10-40x faster
// than the reference code below
template<bool noscale, typename SRC>
static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
                                      const float kx[], const float ky[], int border,
                                      float scale, float delta,
                                      float *buf[], int y, int y0)
{
    int r[3];
    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
    r[1] = (y - y0 + 1) % 3;  //            this
    r[2] = (y - y0 + 2) % 3;  //            next row

    const int length = width * chan;
    const int shift = border * chan;

    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];

    // horizontal pass

    int k0 = (y == y0)? 0: 2;

    for (int k = k0; k < 3; k++)
    {
        //                  previous , this , next pixel
        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};

        // rely on compiler vectoring
        for (int l=0; l < length; l++)
        {
            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
        }
    }

    // vertical pass

    const int r0=r[0], r1=r[1], r2=r[2];

    for (int l=0; l < length;)
    {
        constexpr int nlanes = v_uint8::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
                      sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
                      sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);

            v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
                      sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
                      sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);

            v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
                      sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
                      sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);

            v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
                      sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
                      sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);

            if (!noscale)
            {
                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
            }

            v_int32 isum0 = v_round(sum0),
                    isum1 = v_round(sum1),
                    isum2 = v_round(sum2),
                    isum3 = v_round(sum3);

            v_int16 ires0 = v_pack(isum0, isum1),
                    ires1 = v_pack(isum2, isum3);

            v_uint8 res = v_pack_u(ires0, ires1);
            v_store(reinterpret_cast<uchar*>(&out[l]), res);
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}

// this code, manually vectored for int16, is not much faster than the
// generic any-to-short code above
#define USE_SEPFILTER3X3_CHAR2SHORT 1

#if USE_SEPFILTER3X3_CHAR2SHORT
template<bool noscale>
static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
                                        const float kx[], const float ky[], int border,
                                        float scale, float delta,
                                        float *buf[], int y, int y0)
{
    const schar ikx0 = saturate<schar>(kx[0], rintf);
    const schar ikx1 = saturate<schar>(kx[1], rintf);
    const schar ikx2 = saturate<schar>(kx[2], rintf);

    const schar iky0 = saturate<schar>(ky[0], rintf);
    const schar iky1 = saturate<schar>(ky[1], rintf);
    const schar iky2 = saturate<schar>(ky[2], rintf);

    const short iscale = saturate<short>(scale * (1 << 15), rintf);
    const short idelta = saturate<short>(delta            , rintf);

    // check if this code is applicable: the kernel and delta must be
    // integer-valued, and scale must fit into the Q0.15 range
    if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
        iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
        idelta != delta ||
        std::abs(scale) > 1 || std::abs(scale) < 0.01)
    {
        run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    short *ibuf[3];
    ibuf[0] = reinterpret_cast<short*>(buf[0]);
    ibuf[1] = reinterpret_cast<short*>(buf[1]);
    ibuf[2] = reinterpret_cast<short*>(buf[2]);

    int r[3];
    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
    r[1] = (y - y0 + 1) % 3;  //            this
    r[2] = (y - y0 + 2) % 3;  //            next row

    const int length = width * chan;
    const int shift = border * chan;

    // horizontal pass

    int k0 = (y == y0)? 0: 2;

    for (int k = k0; k < 3; k++)
    {
        for (int l=0; l < length;)
        {
            constexpr int nlanes = v_int16::nlanes;

            // main part of output row
            for (; l <= length - nlanes; l += nlanes)
            {
                v_uint16 t0 = vx_load_expand(&in[k][l - shift]);  // previous
                v_uint16 t1 = vx_load_expand(&in[k][l        ]);  // current
                v_uint16 t2 = vx_load_expand(&in[k][l + shift]);  // next pixel
                v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
                            v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
                            v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
                v_store(&ibuf[r[k]][l], t);
            }

            // tail (if any)
            if (l < length)
            {
                GAPI_DbgAssert(length >= nlanes);
                l = length - nlanes;
            }
        }
    }

    // vertical pass

    for (int l=0; l < length;)
    {
        constexpr int nlanes = v_int16::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_int16 s0 = vx_load(&ibuf[r[0]][l]);  // previous
            v_int16 s1 = vx_load(&ibuf[r[1]][l]);  // current
            v_int16 s2 = vx_load(&ibuf[r[2]][l]);  // next row
            v_int16 s = s0 * vx_setall_s16(iky0) +
                        s1 * vx_setall_s16(iky1) +
                        s2 * vx_setall_s16(iky2);
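
            // iscale holds scale in Q0.15 (scale * 2^15), so
            // v_mul_hi(s << 1, iscale) == (2*s * iscale) >> 16 == s * scale:
            // one high-multiply applies the scaling entirely in int16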
            if (!noscale)
            {
                s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
            }

            v_store(&out[l], s);
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
#endif  // USE_SEPFILTER3X3_CHAR2SHORT

#endif  // CV_SIMD

template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta,
                                       float *buf[], int y, int y0)
{
    int r[3];
    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
    r[1] = (y - y0 + 1) % 3;  //            this
    r[2] = (y - y0 + 2) % 3;  //            next row

    int length = width * chan;
    int shift = border * chan;

    // horizontal pass

    // a full horizontal pass is needed only for the very 1st row in the ROI;
    // for the 2nd and further rows, it is enough to convolve only the
    // "next" row - we can reuse the buffers from previous calls to
    // this kernel (Fluid processes rows consecutively: y=y0, y0+1, ...)

    int k0 = (y == y0)? 0: 2;

    for (int k = k0; k < 3; k++)
    {
        //                  previous , this , next pixel
        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};

        // rely on compiler vectoring
        for (int l=0; l < length; l++)
        {
            buf[r[k]][l] = s[0][l]*kx[0] + s[1][l]*kx[1] + s[2][l]*kx[2];
        }
    }

    // vertical pass

    for (int l=0; l < length; l++)
    {
        float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];

        if (!noscale)
        {
            sum = sum*scale + delta;
        }

        out[l] = saturate<DST>(sum, rintf);
    }
}

template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
                                  const float kx[], const float ky[], int border,
                                  float scale, float delta,
                                  float *buf[], int y, int y0)
{
#if CV_SIMD
    int length = width * chan;

    // the length variable may be unused if types do not match at the 'if' statements below
    (void) length;

#if USE_SEPFILTER3X3_CHAR2SHORT
    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
        length >= v_int16::nlanes)
    {
        // only slightly faster than the more generic any-to-short (see below)
        run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
                                             reinterpret_cast<const uchar**>(in),
                                             width, chan, kx, ky, border, scale, delta,
                                             buf, y, y0);
        return;
    }
#endif

    if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
        length >= v_float32::nlanes)
    {
        // appears 15% faster than the reference any-to-float code (called below)
        run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
                                            width, chan, kx, ky, border, scale, delta);
        return;
    }

    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
    {
        // appears 10-40x faster than reference due to much faster rounding
        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
                                            width, chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
    {
        // appears 10-40x faster than reference due to much faster rounding
        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
                                            width, chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
    {
        // appears 10-40x faster than reference due to much faster rounding
        run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
                                           width, chan, kx, ky, border, scale, delta,
                                           buf, y, y0);
        return;
    }
#endif  // CV_SIMD

    // the reference code is quite fast for the any-to-float case,
    // but not for any-to-integral due to very slow rounding
    run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
                                        scale, delta, buf, y, y0);
}

#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                      \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
                           const float kx[], const float ky[], int border,   \
                           float scale, float delta,                         \
                           float *buf[], int y, int y0)                      \
{                                                                            \
    if (scale == 1 && delta == 0)                                            \
    {                                                                        \
        constexpr bool noscale = true;                                       \
        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
                                       scale, delta, buf, y, y0);            \
    }                                                                        \
    else                                                                     \
    {                                                                        \
        constexpr bool noscale = false;                                      \
        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
                                       scale, delta, buf, y, y0);            \
    }                                                                        \
}

RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)

#undef RUN_SEPFILTER3X3_IMPL
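
// Usage sketch (illustrative only; names other than run_sepfilter3x3_impl
// are made up here): a 3x3 Sobel x-derivative, uchar input to short output,
// would be dispatched as
//
//     float kx[] = {-1.f, 0.f, 1.f};
//     float ky[] = { 1.f, 2.f, 1.f};
//     run_sepfilter3x3_impl(out_row, in_rows, width, chan, kx, ky,
//                           /*border=*/1, /*scale=*/1.f, /*delta=*/0.f,
//                           scratch, y, y0);
//
// where in_rows[] points at the previous/this/next input rows and scratch[]
// at three float rows of length width*chan.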

//-------------------------
//
// Fluid kernels: Filter 2D
//
//-------------------------

template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan,
                                       const float kernel[], float scale, float delta)
{
    static constexpr int ksize = 3;
    static constexpr int border = (ksize - 1) / 2;

    const int length = width * chan;
    const int shift = border * chan;

    const float k[3][3] = {{ kernel[0], kernel[1], kernel[2] },
                           { kernel[3], kernel[4], kernel[5] },
                           { kernel[6], kernel[7], kernel[8] }};

    for (int l=0; l < length; l++)
    {
        float sum = in[0][l - shift] * k[0][0] + in[0][l] * k[0][1] + in[0][l + shift] * k[0][2]
                  + in[1][l - shift] * k[1][0] + in[1][l] * k[1][1] + in[1][l + shift] * k[1][2]
                  + in[2][l - shift] * k[2][0] + in[2][l] * k[2][1] + in[2][l + shift] * k[2][2];

        if (!noscale)
        {
            sum = sum*scale + delta;
        }

        out[l] = saturate<DST>(sum, rintf);
    }
}

#if CV_SIMD
// assume DST is short or ushort
template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
                                       const float kernel[], float scale, float delta)
{
    static constexpr int ksize = 3;
    static constexpr int border = (ksize - 1) / 2;

    const int length = width * chan;
    const int shift = border * chan;

    const float k[3][3] = {
        { kernel[0], kernel[1], kernel[2] },
        { kernel[3], kernel[4], kernel[5] },
        { kernel[6], kernel[7], kernel[8] }
    };

    for (int l=0; l < length;)
    {
        static constexpr int nlanes = v_int16::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            auto sumx = [in, shift, &k](int i, int j)
            {
                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
                          s = v_fma(vx_load_f32(&in[i][j        ]), vx_setall_f32(k[i][1]), s);
                          s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s);
                return s;
            };

            int l0 = l;
            int l1 = l + nlanes/2;
            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);

            if (!noscale)
            {
                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
            }

            v_int32 res0 = v_round(sum0);
            v_int32 res1 = v_round(sum1);

            if (std::is_same<DST, ushort>::value)
            {
                v_uint16 res = v_pack_u(res0, res1);
                v_store(reinterpret_cast<ushort*>(&out[l]), res);
            }
            else // if DST == short
            {
                v_int16 res = v_pack(res0, res1);
                v_store(reinterpret_cast<short*>(&out[l]), res);
            }
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}

template<bool noscale, typename SRC>
static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
                                      const float kernel[], float scale, float delta)
{
    static constexpr int ksize = 3;
    static constexpr int border = (ksize - 1) / 2;

    const int length = width * chan;
    const int shift = border * chan;

    const float k[3][3] = {
        { kernel[0], kernel[1], kernel[2] },
        { kernel[3], kernel[4], kernel[5] },
        { kernel[6], kernel[7], kernel[8] }
    };

    for (int l=0; l < length;)
    {
        static constexpr int nlanes = v_uint8::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            auto sumx = [in, shift, &k](int i, int j)
            {
                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
                          s = v_fma(vx_load_f32(&in[i][j        ]), vx_setall_f32(k[i][1]), s);
                          s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s);
                return s;
            };

            int l0 = l;
            int l1 = l +   nlanes/4;
            int l2 = l + 2*nlanes/4;
            int l3 = l + 3*nlanes/4;
            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
            v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2);
            v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3);

            if (!noscale)
            {
                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
            }

            v_int32 res0 = v_round(sum0);
            v_int32 res1 = v_round(sum1);
            v_int32 res2 = v_round(sum2);
            v_int32 res3 = v_round(sum3);

            v_int16 resl = v_pack(res0, res1);
            v_int16 resh = v_pack(res2, res3);
            v_uint8 res = v_pack_u(resl, resh);

            v_store(&out[l], res);
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
#endif  // CV_SIMD

template<bool noscale, typename DST, typename SRC>
static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
                                  const float kernel[], float scale, float delta)
{
#if CV_SIMD
    int length = width * chan;

    // the length variable may be unused if types do not match at the 'if' statements below
    (void) length;

    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
    {
        run_filter2d_3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
                                            width, chan, kernel, scale, delta);
        return;
    }

    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
    {
        run_filter2d_3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
                                            width, chan, kernel, scale, delta);
        return;
    }

    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
    {
        run_filter2d_3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
                                           width, chan, kernel, scale, delta);
        return;
    }
#endif  // CV_SIMD

    run_filter2d_3x3_reference<noscale>(out, in, width, chan, kernel, scale, delta);
}

#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                              \
void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan,          \
                           const float kernel[], float scale, float delta)           \
{                                                                                    \
    if (scale == 1 && delta == 0)                                                    \
    {                                                                                \
        constexpr bool noscale = true;                                               \
        run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta);  \
    }                                                                                \
    else                                                                             \
    {                                                                                \
        constexpr bool noscale = false;                                              \
        run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta);  \
    }                                                                                \
}

RUN_FILTER2D_3X3_IMPL(uchar , uchar )
RUN_FILTER2D_3X3_IMPL(ushort, ushort)
RUN_FILTER2D_3X3_IMPL( short, short)
RUN_FILTER2D_3X3_IMPL( float, uchar )
RUN_FILTER2D_3X3_IMPL( float, ushort)
RUN_FILTER2D_3X3_IMPL( float, short)
RUN_FILTER2D_3X3_IMPL( float, float)

#undef RUN_FILTER2D_3X3_IMPL

//-----------------------------
//
// Fluid kernels: Erode, Dilate
//
//-----------------------------

template<typename T>
static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan,
                                        const uchar k[], MorphShape k_type,
                                        Morphology morphology)
{
    constexpr int k_size = 3;
    constexpr int border = (k_size - 1) / 2;

    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};

    const int length = width * chan;
    const int shift = border * chan;
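
    // Two common structuring elements are special-cased: M_FULL (all nine
    // taps) and M_CROSS (corner taps skipped); any other mask falls through
    // to the generic per-tap tests on kernel[][]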

    if (M_ERODE == morphology)
    {
        if (M_FULL == k_type)
        {
            for (int l=0; l < length; l++)
            {
                T result = std::numeric_limits<T>::max();

                result = (std::min)(result, in[0][l - shift]);
                result = (std::min)(result, in[0][l        ]);
                result = (std::min)(result, in[0][l + shift]);

                result = (std::min)(result, in[1][l - shift]);
                result = (std::min)(result, in[1][l        ]);
                result = (std::min)(result, in[1][l + shift]);

                result = (std::min)(result, in[2][l - shift]);
                result = (std::min)(result, in[2][l        ]);
                result = (std::min)(result, in[2][l + shift]);

                out[l] = result;
            }
            return;
        }

        if (M_CROSS == k_type)
        {
            for (int l=0; l < length; l++)
            {
                T result = std::numeric_limits<T>::max();

            //  result = (std::min)(result, in[0][l - shift]);
                result = (std::min)(result, in[0][l        ]);
            //  result = (std::min)(result, in[0][l + shift]);

                result = (std::min)(result, in[1][l - shift]);
                result = (std::min)(result, in[1][l        ]);
                result = (std::min)(result, in[1][l + shift]);

            //  result = (std::min)(result, in[2][l - shift]);
                result = (std::min)(result, in[2][l        ]);
            //  result = (std::min)(result, in[2][l + shift]);

                out[l] = result;
            }
            return;
        }

        for (int l=0; l < length; l++)
        {
            T result = std::numeric_limits<T>::max();

            result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result;
            result = kernel[0][1]? (std::min)(result, in[0][l        ]): result;
            result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result;

            result = kernel[1][0]? (std::min)(result, in[1][l - shift]): result;
            result = kernel[1][1]? (std::min)(result, in[1][l        ]): result;
            result = kernel[1][2]? (std::min)(result, in[1][l + shift]): result;

            result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result;
            result = kernel[2][1]? (std::min)(result, in[2][l        ]): result;
            result = kernel[2][2]? (std::min)(result, in[2][l + shift]): result;

            out[l] = result;
        }
        return;
    }

    if (M_DILATE == morphology)
    {
        if (M_FULL == k_type)
        {
            for (int l=0; l < length; l++)
            {
                T result = std::numeric_limits<T>::lowest();

                result = (std::max)(result, in[0][l - shift]);
                result = (std::max)(result, in[0][l        ]);
                result = (std::max)(result, in[0][l + shift]);

                result = (std::max)(result, in[1][l - shift]);
                result = (std::max)(result, in[1][l        ]);
                result = (std::max)(result, in[1][l + shift]);

                result = (std::max)(result, in[2][l - shift]);
                result = (std::max)(result, in[2][l        ]);
                result = (std::max)(result, in[2][l + shift]);

                out[l] = result;
            }
            return;
        }

        if (M_CROSS == k_type)
        {
            for (int l=0; l < length; l++)
            {
                T result = std::numeric_limits<T>::lowest();

            //  result = (std::max)(result, in[0][l - shift]);
                result = (std::max)(result, in[0][l        ]);
            //  result = (std::max)(result, in[0][l + shift]);

                result = (std::max)(result, in[1][l - shift]);
                result = (std::max)(result, in[1][l        ]);
                result = (std::max)(result, in[1][l + shift]);

            //  result = (std::max)(result, in[2][l - shift]);
                result = (std::max)(result, in[2][l        ]);
            //  result = (std::max)(result, in[2][l + shift]);

                out[l] = result;
            }
            return;
        }

        for (int l=0; l < length; l++)
        {
            T result = std::numeric_limits<T>::lowest();

            result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result;
            result = kernel[0][1]? (std::max)(result, in[0][l        ]): result;
            result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result;

            result = kernel[1][0]? (std::max)(result, in[1][l - shift]): result;
            result = kernel[1][1]? (std::max)(result, in[1][l        ]): result;
            result = kernel[1][2]? (std::max)(result, in[1][l + shift]): result;

            result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result;
            result = kernel[2][1]? (std::max)(result, in[2][l        ]): result;
            result = kernel[2][2]? (std::max)(result, in[2][l + shift]): result;

            out[l] = result;
        }
        return;
    }

    CV_Error(cv::Error::StsBadArg, "unsupported morphology");
}

#if CV_SIMD
template<typename T, typename VT, typename S>
static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                                   const uchar k[], MorphShape k_type,
                                   Morphology morphology,
                                   S setall)
{
    constexpr int k_size = 3;
    constexpr int border = (k_size - 1) / 2;

    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};

    const int length = width * chan;
    const int shift = border * chan;

    if (M_ERODE == morphology)
    {
        if (M_FULL == k_type)
        {
            for (int l=0; l < length;)
            {
                constexpr int nlanes = VT::nlanes;

                // main part of output row
                for (; l <= length - nlanes; l += nlanes)
                {
                    VT r = setall(std::numeric_limits<T>::max());

                    r = v_min(r, vx_load(&in[0][l - shift]));
                    r = v_min(r, vx_load(&in[0][l        ]));
                    r = v_min(r, vx_load(&in[0][l + shift]));

                    r = v_min(r, vx_load(&in[1][l - shift]));
                    r = v_min(r, vx_load(&in[1][l        ]));
                    r = v_min(r, vx_load(&in[1][l + shift]));

                    r = v_min(r, vx_load(&in[2][l - shift]));
                    r = v_min(r, vx_load(&in[2][l        ]));
                    r = v_min(r, vx_load(&in[2][l + shift]));

                    v_store(&out[l], r);
                }

                // tail (if any)
                if (l < length)
                {
                    GAPI_DbgAssert(length >= nlanes);
                    l = length - nlanes;
                }
            }
            return;
        }

        if (M_CROSS == k_type)
        {
            for (int l=0; l < length;)
            {
                constexpr int nlanes = VT::nlanes;

                // main part of output row
                for (; l <= length - nlanes; l += nlanes)
                {
                    VT r = setall(std::numeric_limits<T>::max());

                //  r = v_min(r, vx_load(&in[0][l - shift]));
                    r = v_min(r, vx_load(&in[0][l        ]));
                //  r = v_min(r, vx_load(&in[0][l + shift]));

                    r = v_min(r, vx_load(&in[1][l - shift]));
                    r = v_min(r, vx_load(&in[1][l        ]));
                    r = v_min(r, vx_load(&in[1][l + shift]));

                //  r = v_min(r, vx_load(&in[2][l - shift]));
                    r = v_min(r, vx_load(&in[2][l        ]));
                //  r = v_min(r, vx_load(&in[2][l + shift]));

                    v_store(&out[l], r);
                }

                // tail (if any)
                if (l < length)
                {
                    GAPI_DbgAssert(length >= nlanes);
                    l = length - nlanes;
                }
            }
            return;
        }

        for (int l=0; l < length;)
        {
            constexpr int nlanes = VT::nlanes;

            // main part of output row
            for (; l <= length - nlanes; l += nlanes)
            {
                VT r = setall(std::numeric_limits<T>::max());

                if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift]));
                if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l        ]));
                if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift]));

                if (kernel[1][0]) r = v_min(r, vx_load(&in[1][l - shift]));
                if (kernel[1][1]) r = v_min(r, vx_load(&in[1][l        ]));
                if (kernel[1][2]) r = v_min(r, vx_load(&in[1][l + shift]));

                if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift]));
                if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l        ]));
                if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift]));

                v_store(&out[l], r);
            }

            // tail (if any)
            if (l < length)
            {
                GAPI_DbgAssert(length >= nlanes);
                l = length - nlanes;
            }
        }
        return;
    }

    if (M_DILATE == morphology)
    {
        if (M_FULL == k_type)
        {
            for (int l=0; l < length;)
            {
                constexpr int nlanes = VT::nlanes;

                // main part of output row
                for (; l <= length - nlanes; l += nlanes)
                {
                    VT r = setall(std::numeric_limits<T>::lowest());

                    r = v_max(r, vx_load(&in[0][l - shift]));
                    r = v_max(r, vx_load(&in[0][l        ]));
                    r = v_max(r, vx_load(&in[0][l + shift]));

                    r = v_max(r, vx_load(&in[1][l - shift]));
                    r = v_max(r, vx_load(&in[1][l        ]));
                    r = v_max(r, vx_load(&in[1][l + shift]));

                    r = v_max(r, vx_load(&in[2][l - shift]));
                    r = v_max(r, vx_load(&in[2][l        ]));
                    r = v_max(r, vx_load(&in[2][l + shift]));

                    v_store(&out[l], r);
                }

                // tail (if any)
                if (l < length)
                {
                    GAPI_DbgAssert(length >= nlanes);
                    l = length - nlanes;
                }
            }
            return;
        }

        if (M_CROSS == k_type)
        {
            for (int l=0; l < length;)
            {
                constexpr int nlanes = VT::nlanes;

                // main part of output row
                for (; l <= length - nlanes; l += nlanes)
                {
                    VT r = setall(std::numeric_limits<T>::lowest());

                //  r = v_max(r, vx_load(&in[0][l - shift]));
                    r = v_max(r, vx_load(&in[0][l        ]));
                //  r = v_max(r, vx_load(&in[0][l + shift]));

                    r = v_max(r, vx_load(&in[1][l - shift]));
                    r = v_max(r, vx_load(&in[1][l        ]));
                    r = v_max(r, vx_load(&in[1][l + shift]));

                //  r = v_max(r, vx_load(&in[2][l - shift]));
                    r = v_max(r, vx_load(&in[2][l        ]));
                //  r = v_max(r, vx_load(&in[2][l + shift]));

                    v_store(&out[l], r);
                }

                // tail (if any)
                if (l < length)
                {
                    GAPI_DbgAssert(length >= nlanes);
                    l = length - nlanes;
                }
            }
            return;
        }

        for (int l=0; l < length;)
        {
            constexpr int nlanes = VT::nlanes;

            // main part of output row
            for (; l <= length - nlanes; l += nlanes)
            {
                VT r = setall(std::numeric_limits<T>::lowest());

                if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift]));
                if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l        ]));
                if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift]));

                if (kernel[1][0]) r = v_max(r, vx_load(&in[1][l - shift]));
                if (kernel[1][1]) r = v_max(r, vx_load(&in[1][l        ]));
                if (kernel[1][2]) r = v_max(r, vx_load(&in[1][l + shift]));

                if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift]));
                if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l        ]));
                if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift]));

                v_store(&out[l], r);
            }

            // tail (if any)
            if (l < length)
            {
                GAPI_DbgAssert(length >= nlanes);
                l = length - nlanes;
            }
        }
        return;
    }

    CV_Error(cv::Error::StsBadArg, "unsupported morphology");
}
#endif  // CV_SIMD

template<typename T>
static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
                                   const uchar k[], MorphShape k_type,
                                   Morphology morphology)
{
#if CV_SIMD
    int length = width * chan;

    // the length variable may be unused if types do not match at the 'if' statements below
    (void) length;

    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
    {
        run_morphology3x3_simd<float, v_float32>(reinterpret_cast<float*>(out),
                                                 reinterpret_cast<const float**>(in),
                                                 width, chan, k, k_type, morphology,
                                                 [](float v){ return vx_setall_f32(v); });
        return;
    }

    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
    {
        run_morphology3x3_simd<short, v_int16>(reinterpret_cast<short*>(out),
                                               reinterpret_cast<const short**>(in),
                                               width, chan, k, k_type, morphology,
                                               [](short v){ return vx_setall_s16(v); });
        return;
    }

    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
    {
        run_morphology3x3_simd<ushort, v_uint16>(reinterpret_cast<ushort*>(out),
                                                 reinterpret_cast<const ushort**>(in),
                                                 width, chan, k, k_type, morphology,
                                                 [](ushort v){ return vx_setall_u16(v); });
        return;
    }

    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
    {
        run_morphology3x3_simd<uchar, v_uint8>(reinterpret_cast<uchar*>(out),
                                               reinterpret_cast<const uchar**>(in),
                                               width, chan, k, k_type, morphology,
                                               [](uchar v){ return vx_setall_u8(v); });
        return;
    }
#endif  // CV_SIMD

    run_morphology3x3_reference(out, in, width, chan, k, k_type, morphology);
}

#define RUN_MORPHOLOGY3X3_IMPL(T)                                         \
void run_morphology3x3_impl(T out[], const T *in[], int width, int chan,  \
                            const uchar k[], MorphShape k_type,           \
                            Morphology morphology)                        \
{                                                                         \
    run_morphology3x3_code(out, in, width, chan, k, k_type, morphology);  \
}

RUN_MORPHOLOGY3X3_IMPL(uchar )
RUN_MORPHOLOGY3X3_IMPL(ushort)
RUN_MORPHOLOGY3X3_IMPL( short)
RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

template<typename T>
static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize = 3;
    constexpr int border = (ksize - 1) / 2;

    const int length = width * chan;
    const int shift = border * chan;
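
    // The median of each 3x3 neighbourhood is extracted with a fixed partial
    // sorting network (rows, then columns, then one anti-diagonal): after 19
    // compare-exchange steps the median sits in t[1][1], with no need to
    // fully sort all nine values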

    for (int l=0; l < length; l++)
    {
        T t[3][3];

        // neighbourhood 3x3
        t[0][0] = in[0][l - shift];    t[0][1] = in[0][l];    t[0][2] = in[0][l + shift];
        t[1][0] = in[1][l - shift];    t[1][1] = in[1][l];    t[1][2] = in[1][l + shift];
        t[2][0] = in[2][l - shift];    t[2][1] = in[2][l];    t[2][2] = in[2][l + shift];

        // sort 2 values
        auto sort = [](T& a, T& b)
        {
            T u=a, v=b;
            a = (std::min)(u, v);
            b = (std::max)(u, v);
        };

        // horizontal: 3-element bubble-sort of each row
        sort(t[0][0], t[0][1]);    sort(t[0][1], t[0][2]);    sort(t[0][0], t[0][1]);
        sort(t[1][0], t[1][1]);    sort(t[1][1], t[1][2]);    sort(t[1][0], t[1][1]);
        sort(t[2][0], t[2][1]);    sort(t[2][1], t[2][2]);    sort(t[2][0], t[2][1]);

        // vertical: columns bubble-sort (although partial)
        sort(t[0][0], t[1][0]);    sort(t[0][1], t[1][1]);  /*sort(t[0][2], t[1][2]);*/
        sort(t[1][0], t[2][0]);    sort(t[1][1], t[2][1]);    sort(t[1][2], t[2][2]);
      /*sort(t[0][0], t[1][0]);*/  sort(t[0][1], t[1][1]);    sort(t[0][2], t[1][2]);

        // diagonal: bubble-sort (in opposite order!)
        sort(t[1][1], t[0][2]);    sort(t[2][0], t[1][1]);    sort(t[1][1], t[0][2]);

        out[l] = t[1][1];
    }
}

#if CV_SIMD
template<typename VT, typename T>
static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize = 3;
    constexpr int border = (ksize - 1) / 2;

    const int length = width * chan;
    const int shift = border * chan;

    for (int l=0; l < length;)
    {
        constexpr int nlanes = VT::nlanes;

        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            VT t00, t01, t02, t10, t11, t12, t20, t21, t22;

            // neighbourhood 3x3

            t00 = vx_load(&in[0][l - shift]);
            t01 = vx_load(&in[0][l        ]);
            t02 = vx_load(&in[0][l + shift]);

            t10 = vx_load(&in[1][l - shift]);
            t11 = vx_load(&in[1][l        ]);
            t12 = vx_load(&in[1][l + shift]);

            t20 = vx_load(&in[2][l - shift]);
            t21 = vx_load(&in[2][l        ]);
            t22 = vx_load(&in[2][l + shift]);

            // sort 2 values
            auto sort = [](VT& a, VT& b)
            {
                VT u=a, v=b;
                a = v_min(u, v);
                b = v_max(u, v);
            };

            // horizontal: 3-element bubble-sort of each row
            sort(t00, t01);    sort(t01, t02);    sort(t00, t01);
            sort(t10, t11);    sort(t11, t12);    sort(t10, t11);
            sort(t20, t21);    sort(t21, t22);    sort(t20, t21);

            // vertical: columns bubble-sort (although partial)
            sort(t00, t10);    sort(t01, t11);  /*sort(t02, t12);*/
            sort(t10, t20);    sort(t11, t21);    sort(t12, t22);
          /*sort(t00, t10);*/  sort(t01, t11);    sort(t02, t12);

            // diagonal: bubble-sort (in opposite order!)
            sort(t11, t02);    sort(t20, t11);    sort(t11, t02);
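
            // t11 now holds, lane-wise, the median of the nine neighbours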
            v_store(&out[l], t11);
        }

        // tail (if any)
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
#endif  // CV_SIMD

template<typename T>
static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
{
#if CV_SIMD
    int length = width * chan;

    // the length variable may be unused if types do not match at the 'if' statements below
    (void) length;

    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
    {
        run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
                                       reinterpret_cast<const float**>(in),
                                       width, chan);
        return;
    }

    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
    {
        run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
                                     reinterpret_cast<const short**>(in),
                                     width, chan);
        return;
    }

    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
    {
        run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
                                      reinterpret_cast<const ushort**>(in),
                                      width, chan);
        return;
    }

    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
    {
        run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
                                     reinterpret_cast<const uchar**>(in),
                                     width, chan);
        return;
    }
#endif  // CV_SIMD

    run_medblur3x3_reference(out, in, width, chan);
}

#define RUN_MEDBLUR3X3_IMPL(T)                                         \
void run_medblur3x3_impl(T out[], const T *in[], int width, int chan)  \
{                                                                      \
    run_medblur3x3_code(out, in, width, chan);                         \
}

RUN_MEDBLUR3X3_IMPL(uchar )
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL( short)
RUN_MEDBLUR3X3_IMPL( float)

#undef RUN_MEDBLUR3X3_IMPL

//------------------------------------------------------------------------------

#endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END

}  // namespace fluid
}  // namespace gapi
}  // namespace cv

#endif  // !defined(GAPI_STANDALONE)