inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4 //
   5 // Copyright (C) 2018-2019 Intel Corporation
   6
   7 // NB: allow including this *.hpp several times!
   8 // #pragma once -- don't: this file is NOT once!
   9
  10 #if !defined(GAPI_STANDALONE)
  11
  12 #include "gfluidimgproc_func.hpp"
  13
  14 #include "opencv2/gapi/own/saturate.hpp"
  15
  16 #include "opencv2/core.hpp"
  17 #include "opencv2/core/hal/intrin.hpp"
  18
  19 #include <cstdint>
  20 #include <cstring>
  21
  22 #include <algorithm>
  23 #include <limits>
  24 #include <vector>
  25
  26 #ifdef __GNUC__
  27 #  pragma GCC diagnostic push
  28 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
  29 #endif
  30
  31 using cv::gapi::own::saturate;
  32
  33 namespace cv {
  34 namespace gapi {
  35 namespace fluid {
  36
  37 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
  38
  39 //----------------------------------
  40 //
  41 // Fluid kernels: RGB2Gray, BGR2Gray
  42 //
  43 //----------------------------------
  44
  45 void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
  46                        float coef_r, float coef_g, float coef_b);
  47
  48 //--------------------------------------
  49 //
  50 // Fluid kernels: RGB-to-YUV, YUV-to-RGB
  51 //
  52 //--------------------------------------
  53
  54 void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
  55
  56 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
  57
  58 //-------------------------
  59 //
  60 // Fluid kernels: sepFilter
  61 //
  62 //-------------------------
  63
  64 #define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
  65 void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
  66                            const float kx[], const float ky[], int border,  \
  67                            float scale, float delta,                        \
  68                            float *buf[], int y, int y0);
  69
  70 RUN_SEPFILTER3X3_IMPL(uchar , uchar )
  71 RUN_SEPFILTER3X3_IMPL( short, uchar )
  72 RUN_SEPFILTER3X3_IMPL( float, uchar )
  73 RUN_SEPFILTER3X3_IMPL(ushort, ushort)
  74 RUN_SEPFILTER3X3_IMPL( short, ushort)
  75 RUN_SEPFILTER3X3_IMPL( float, ushort)
  76 RUN_SEPFILTER3X3_IMPL( short,  short)
  77 RUN_SEPFILTER3X3_IMPL( float,  short)
  78 RUN_SEPFILTER3X3_IMPL( float,  float)
  79
  80 #undef RUN_SEPFILTER3X3_IMPL
  81
  82 //-------------------------
  83 //
  84 // Fluid kernels: Filter 2D
  85 //
  86 //-------------------------
  87
  88 #define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                     \
  89 void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
  90                            const float kernel[], float scale, float delta);
  91
  92 RUN_FILTER2D_3X3_IMPL(uchar , uchar )
  93 RUN_FILTER2D_3X3_IMPL(ushort, ushort)
  94 RUN_FILTER2D_3X3_IMPL( short,  short)
  95 RUN_FILTER2D_3X3_IMPL( float, uchar )
  96 RUN_FILTER2D_3X3_IMPL( float, ushort)
  97 RUN_FILTER2D_3X3_IMPL( float,  short)
  98 RUN_FILTER2D_3X3_IMPL( float,  float)
  99
 100 #undef RUN_FILTER2D_3X3_IMPL
 101
 102 //-----------------------------
 103 //
 104 // Fluid kernels: Erode, Dilate
 105 //
 106 //-----------------------------
 107
 108 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
 109 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
 110                             const uchar k[], MorphShape k_type,          \
 111                             Morphology morphology);
 112
 113 RUN_MORPHOLOGY3X3_IMPL(uchar )
 114 RUN_MORPHOLOGY3X3_IMPL(ushort)
 115 RUN_MORPHOLOGY3X3_IMPL( short)
 116 RUN_MORPHOLOGY3X3_IMPL( float)
 117
 118 #undef RUN_MORPHOLOGY3X3_IMPL
 119
 120 //---------------------------
 121 //
 122 // Fluid kernels: Median blur
 123 //
 124 //---------------------------
 125
 126 #define RUN_MEDBLUR3X3_IMPL(T) \
 127 void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);
 128
 129 RUN_MEDBLUR3X3_IMPL(uchar )
 130 RUN_MEDBLUR3X3_IMPL(ushort)
 131 RUN_MEDBLUR3X3_IMPL( short)
 132 RUN_MEDBLUR3X3_IMPL( float)
 133
 134 #undef RUN_MEDBLUR3X3_IMPL
 135
 136 //----------------------------------------------------------------------
 137
 138 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 139
 140 #if CV_SIMD
 141 template<typename SRC>
 142 static inline v_float32 vx_load_f32(const SRC* ptr)
 143 {
 144     if (std::is_same<SRC,uchar>::value)
 145     {
 146         v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
 147         return v_cvt_f32(v_reinterpret_as_s32(tmp));
 148     }
 149
 150     if (std::is_same<SRC,ushort>::value)
 151     {
 152         v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
 153         return v_cvt_f32(v_reinterpret_as_s32(tmp));
 154     }
 155
 156     if (std::is_same<SRC,short>::value)
 157     {
 158         v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
 159         return v_cvt_f32(tmp);
 160     }
 161
 162     if (std::is_same<SRC,float>::value)
 163     {
 164         v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
 165         return tmp;
 166     }
 167
 168     CV_Error(cv::Error::StsBadArg, "unsupported type");
 169 }
 170 #endif  // CV_SIMD
 171
 172 //----------------------------------
 173 //
 174 // Fluid kernels: RGB2Gray, BGR2Gray
 175 //
 176 //----------------------------------
 177
 178 void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
 179                        float coef_r, float coef_g, float coef_b)
 180 {
 181     // assume:
 182     // - coefficients are less than 1
 183     // - and their sum equals 1
 184
 185     constexpr int unity = 1 << 16;  // Q0.0.16 inside ushort:
 186     ushort rc = static_cast<ushort>(coef_r * unity + 0.5f);
 187     ushort gc = static_cast<ushort>(coef_g * unity + 0.5f);
 188     ushort bc = static_cast<ushort>(coef_b * unity + 0.5f);
 189
 190     GAPI_Assert(rc + gc + bc <= unity);
 191     GAPI_Assert(rc + gc + bc >= USHRT_MAX);
 192
 193 #if CV_SIMD
 194     constexpr int nlanes = v_uint8::nlanes;
 195     if (width >= nlanes)
 196     {
 197         for (int w=0; w < width; )
 198         {
 199             // process main part of pixels row
 200             for ( ; w <= width - nlanes; w += nlanes)
 201             {
 202                 v_uint8 r, g, b;
 203                 v_load_deinterleave(&in[3*w], r, g, b);
 204
 205                 v_uint16 r0, r1, g0, g1, b0, b1;
 206                 v_expand(r, r0, r1);
 207                 v_expand(g, g0, g1);
 208                 v_expand(b, b0, b1);
 209
 210                 v_uint16 y0, y1;
 211                 static const ushort half = 1 << 7; // Q0.8.8
 212                 y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) +
 213                       v_mul_hi(g0 << 8, vx_setall_u16(gc)) +
 214                       v_mul_hi(b0 << 8, vx_setall_u16(bc)) +
 215                                         vx_setall_u16(half)) >> 8;
 216                 y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) +
 217                       v_mul_hi(g1 << 8, vx_setall_u16(gc)) +
 218                       v_mul_hi(b1 << 8, vx_setall_u16(bc)) +
 219                                         vx_setall_u16(half)) >> 8;
 220
 221                 v_uint8 y;
 222                 y = v_pack(y0, y1);
 223                 v_store(&out[w], y);
 224             }
 225
 226             // process tail (if any)
 227             if (w < width)
 228             {
 229                 GAPI_DbgAssert(width - nlanes >= 0);
 230                 w = width - nlanes;
 231             }
 232         }
 233
 234         return;
 235     }
 236 #endif
 237
 238     for (int w=0; w < width; w++)
 239     {
 240         uchar r = in[3*w    ];
 241         uchar g = in[3*w + 1];
 242         uchar b = in[3*w + 2];
 243
 244         static const int half = 1 << 15;  // Q0.0.16
 245         ushort y = (r*rc + b*bc + g*gc + half) >> 16;
 246         out[w] = static_cast<uchar>(y);
 247     }
 248 }
 249
 250 //--------------------------------------
 251 //
 252 // Fluid kernels: RGB-to-YUV, YUV-to-RGB
 253 //
 254 //--------------------------------------
 255
 256 void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
 257 {
 258     ushort c0 = static_cast<ushort>(coef[0]*(1 << 16) + 0.5f);  // Q0.0.16 un-signed
 259     ushort c1 = static_cast<ushort>(coef[1]*(1 << 16) + 0.5f);
 260     ushort c2 = static_cast<ushort>(coef[2]*(1 << 16) + 0.5f);
 261     short c3 = static_cast<short>(coef[3]*(1 << 12) + 0.5f);    // Q1.0.12 signed
 262     short c4 = static_cast<short>(coef[4]*(1 << 12) + 0.5f);
 263
 264     int w = 0;
 265
 266 #if CV_SIMD
 267     static const int nlanes = v_uint8::nlanes;
 268     for ( ; w <= width - nlanes; w += nlanes)
 269     {
 270         v_uint8 r, g, b;
 271         v_load_deinterleave(&in[3*w], r, g, b);
 272
 273         v_uint16 _r0, _r1, _g0, _g1, _b0, _b1;
 274         v_expand(r, _r0, _r1);
 275         v_expand(g, _g0, _g1);
 276         v_expand(b, _b0, _b1);
 277
 278         _r0 = _r0 << 7;                         // Q0.9.7 un-signed
 279         _r1 = _r1 << 7;
 280         _g0 = _g0 << 7;
 281         _g1 = _g1 << 7;
 282         _b0 = _b0 << 7;
 283         _b1 = _b1 << 7;
 284
 285         v_uint16 _y0, _y1;
 286         _y0 = v_mul_hi(vx_setall_u16(c0), _r0)  // Q0.9.7
 287             + v_mul_hi(vx_setall_u16(c1), _g0)
 288             + v_mul_hi(vx_setall_u16(c2), _b0);
 289         _y1 = v_mul_hi(vx_setall_u16(c0), _r1)
 290             + v_mul_hi(vx_setall_u16(c1), _g1)
 291             + v_mul_hi(vx_setall_u16(c2), _b1);
 292
 293         v_int16 r0, r1, b0, b1, y0, y1;
 294         r0 = v_reinterpret_as_s16(_r0);         // Q1.8.7 signed
 295         r1 = v_reinterpret_as_s16(_r1);
 296         b0 = v_reinterpret_as_s16(_b0);
 297         b1 = v_reinterpret_as_s16(_b1);
 298         y0 = v_reinterpret_as_s16(_y0);
 299         y1 = v_reinterpret_as_s16(_y1);
 300
 301         v_int16 u0, u1, v0, v1;
 302         u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0);  // Q1.12.3
 303         u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1);
 304         v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0);
 305         v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1);
 306
 307         v_uint8 y, u, v;
 308         y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7,
 309                    (_y1 + vx_setall_u16(1 << 6)) >> 7);
 310         u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3,  // 257 << 2 = 128.5 * (1 << 3)
 311                      (u1 + vx_setall_s16(257 << 2)) >> 3);
 312         v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3,
 313                      (v1 + vx_setall_s16(257 << 2)) >> 3);
 314
 315         v_store_interleave(&out[3*w], y, u, v);
 316     }
 317 #endif
 318
 319     for ( ; w < width; w++)
 320     {
 321         short r = in[3*w    ] << 7;                            // Q1.8.7 signed
 322         short g = in[3*w + 1] << 7;
 323         short b = in[3*w + 2] << 7;
 324         short y = (c0*r + c1*g + c2*b) >> 16;                  // Q1.8.7
 325         short u =  c3*(b - y) >> 16;                           // Q1.12.3
 326         short v =  c4*(r - y) >> 16;
 327         out[3*w    ] = static_cast<uchar>((y              + (1 << 6)) >> 7);
 328         out[3*w + 1] =    saturate<uchar>((u + (128 << 3) + (1 << 2)) >> 3);
 329         out[3*w + 2] =    saturate<uchar>((v + (128 << 3) + (1 << 2)) >> 3);
 330     }
 331 }
 332
 333 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
 334 {
 335     short c0 = static_cast<short>(coef[0] * (1 << 12) + 0.5f);  // Q1.3.12
 336     short c1 = static_cast<short>(coef[1] * (1 << 12) + 0.5f);
 337     short c2 = static_cast<short>(coef[2] * (1 << 12) + 0.5f);
 338     short c3 = static_cast<short>(coef[3] * (1 << 12) + 0.5f);
 339
 340     int w = 0;
 341
 342 #if CV_SIMD
 343     static const int nlanes = v_uint8::nlanes;
 344     for ( ; w <= width - nlanes; w += nlanes)
 345     {
 346         v_uint8 y, u, v;
 347         v_load_deinterleave(&in[3*w], y, u, v);
 348
 349         v_uint16 _y0, _y1, _u0, _u1, _v0, _v1;
 350         v_expand(y, _y0, _y1);
 351         v_expand(u, _u0, _u1);
 352         v_expand(v, _v0, _v1);
 353
 354         v_int16 y0, y1, u0, u1, v0, v1;
 355         y0 = v_reinterpret_as_s16(_y0);
 356         y1 = v_reinterpret_as_s16(_y1);
 357         u0 = v_reinterpret_as_s16(_u0);
 358         u1 = v_reinterpret_as_s16(_u1);
 359         v0 = v_reinterpret_as_s16(_v0);
 360         v1 = v_reinterpret_as_s16(_v1);
 361
 362         y0 =  y0 << 3;                              // Q1.12.3
 363         y1 =  y1 << 3;
 364         u0 = (u0 - vx_setall_s16(128)) << 7;        // Q1.8.7
 365         u1 = (u1 - vx_setall_s16(128)) << 7;
 366         v0 = (v0 - vx_setall_s16(128)) << 7;
 367         v1 = (v1 - vx_setall_s16(128)) << 7;
 368
 369         v_int16 r0, r1, g0, g1, b0, b1;
 370         r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
 371         r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
 372         g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
 373                 + v_mul_hi(vx_setall_s16(c2), v0);
 374         g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
 375                 + v_mul_hi(vx_setall_s16(c2), v1);
 376         b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
 377         b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);
 378
 379         v_uint8 r, g, b;
 380         r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
 381                      (r1 + vx_setall_s16(1 << 2)) >> 3);
 382         g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
 383                      (g1 + vx_setall_s16(1 << 2)) >> 3);
 384         b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
 385                      (b1 + vx_setall_s16(1 << 2)) >> 3);
 386
 387         v_store_interleave(&out[3*w], r, g, b);
 388     }
 389 #endif
 390
 391     for ( ; w < width; w++)
 392     {
 393         short y =  in[3*w    ]        << 3;  // Q1.12.3
 394         short u = (in[3*w + 1] - 128) << 7;  // Q1.8.7
 395         short v = (in[3*w + 2] - 128) << 7;
 396         short r = y + (        c0*v  >> 16); // Q1.12.3
 397         short g = y + ((c1*u + c2*v) >> 16);
 398         short b = y + ((c3*u       ) >> 16);
 399         out[3*w    ] = saturate<uchar>((r + (1 << 2)) >> 3);
 400         out[3*w + 1] = saturate<uchar>((g + (1 << 2)) >> 3);
 401         out[3*w + 2] = saturate<uchar>((b + (1 << 2)) >> 3);
 402     }
 403 }
 404
 405 //-------------------------
 406 //
 407 // Fluid kernels: sepFilter
 408 //
 409 //-------------------------
 410
 411 #if CV_SIMD
 412 // this variant not using buf[] appears 15% faster than reference any-2-float code below
 413 template<bool noscale, typename SRC>
 414 static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
 415                                        const float kx[], const float ky[], int border,
 416                                        float scale, float delta)
 417 {
 418     const int length = width * chan;
 419     const int shift = border * chan;
 420
 421     const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
 422     const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
 423
 424     for (int l=0; l < length; )
 425     {
 426         static const int nlanes = v_float32::nlanes;
 427
 428         // main part
 429         for ( ; l <= length - nlanes; l += nlanes)
 430         {
 431             auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])
 432             {
 433                 v_float32 t0 = vx_load_f32(&i[l - shift]);
 434                 v_float32 t1 = vx_load_f32(&i[l        ]);
 435                 v_float32 t2 = vx_load_f32(&i[l + shift]);
 436                 v_float32 t = t0 * vx_setall_f32(kx0);
 437                     t = v_fma(t1,  vx_setall_f32(kx1), t);
 438                     t = v_fma(t2,  vx_setall_f32(kx2), t);
 439                 return t;
 440             };
 441
 442             v_float32 s0 = xsum(in[0]);
 443             v_float32 s1 = xsum(in[1]);
 444             v_float32 s2 = xsum(in[2]);
 445             v_float32 s = s0 * vx_setall_f32(ky0);
 446                 s = v_fma(s1,  vx_setall_f32(ky1), s);
 447                 s = v_fma(s2,  vx_setall_f32(ky2), s);
 448
 449             if (!noscale)
 450             {
 451                 s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
 452             }
 453
 454             v_store(&out[l], s);
 455         }
 456
 457         // tail (if any)
 458         if (l < length)
 459         {
 460             GAPI_DbgAssert(length >= nlanes);
 461             l = length - nlanes;
 462         }
 463     }
 464 }
 465
 466 // this variant with manually vectored rounding to short/ushort appears 10-40x faster
 467 // than reference code below
 468 template<bool noscale, typename DST, typename SRC>
 469 static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
 470                                        const float kx[], const float ky[], int border,
 471                                        float scale, float delta,
 472                                        float *buf[], int y, int y0)
 473 {
 474     int r[3];
 475     r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
 476     r[1] = (y - y0 + 1) % 3;  //            this
 477     r[2] = (y - y0 + 2) % 3;  //            next row
 478
 479     const int length = width * chan;
 480     const int shift = border * chan;
 481
 482     const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
 483     const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
 484
 485     // horizontal pass
 486
 487     int k0 = (y == y0)? 0: 2;
 488
 489     for (int k = k0; k < 3; k++)
 490     {
 491         //                      previous , this , next pixel
 492         const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
 493
 494         // rely on compiler vectoring
 495         for (int l=0; l < length; l++)
 496         {
 497             buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
 498         }
 499     }
 500
 501     // vertical pass
 502
 503     const int r0=r[0], r1=r[1], r2=r[2];
 504
 505     for (int l=0; l < length;)
 506     {
 507         constexpr int nlanes = v_int16::nlanes;
 508
 509         // main part of row
 510         for (; l <= length - nlanes; l += nlanes)
 511         {
 512             v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
 513                 sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
 514                 sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
 515
 516             v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
 517                 sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
 518                 sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
 519
 520             if (!noscale)
 521             {
 522                 sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
 523                 sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
 524             }
 525
 526             v_int32 isum0 = v_round(sum0),
 527                     isum1 = v_round(sum1);
 528
 529             if (std::is_same<DST, short>::value)
 530             {
 531                 // signed short
 532                 v_int16 res = v_pack(isum0, isum1);
 533                 v_store(reinterpret_cast<short*>(&out[l]), res);
 534             } else
 535             {
 536                 // unsigned short
 537                 v_uint16 res = v_pack_u(isum0, isum1);
 538                 v_store(reinterpret_cast<ushort*>(&out[l]), res);
 539             }
 540         }
 541
 542         // tail (if any)
 543         if (l < length)
 544         {
 545             GAPI_DbgAssert(length >= nlanes);
 546             l = length - nlanes;
 547         }
 548     }
 549 }
 550
 551 // this code with manually vectored rounding to uchar is 10-40x faster than reference
 552 template<bool noscale, typename SRC>
 553 static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
 554                                       const float kx[], const float ky[], int border,
 555                                       float scale, float delta,
 556                                       float *buf[], int y, int y0)
 557 {
 558     int r[3];
 559     r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
 560     r[1] = (y - y0 + 1) % 3;  //            this
 561     r[2] = (y - y0 + 2) % 3;  //            next row
 562
 563     const int length = width * chan;
 564     const int shift = border * chan;
 565
 566     const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
 567     const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
 568
 569     // horizontal pass
 570
 571     int k0 = (y == y0)? 0: 2;
 572
 573     for (int k = k0; k < 3; k++)
 574     {
 575         //                      previous , this , next pixel
 576         const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
 577
 578         // rely on compiler vectoring
 579         for (int l=0; l < length; l++)
 580         {
 581             buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
 582         }
 583     }
 584
 585     // vertical pass
 586
 587     const int r0=r[0], r1=r[1], r2=r[2];
 588
 589     for (int l=0; l < length;)
 590     {
 591         constexpr int nlanes = v_uint8::nlanes;
 592
 593         // main part of row
 594         for (; l <= length - nlanes; l += nlanes)
 595         {
 596             v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
 597                 sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
 598                 sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
 599
 600             v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
 601                 sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
 602                 sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
 603
 604             v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
 605                 sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
 606                 sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
 607
 608             v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
 609                 sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
 610                 sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
 611
 612             if (!noscale)
 613             {
 614                 sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
 615                 sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
 616                 sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
 617                 sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
 618             }
 619
 620             v_int32 isum0 = v_round(sum0),
 621                     isum1 = v_round(sum1),
 622                     isum2 = v_round(sum2),
 623                     isum3 = v_round(sum3);
 624
 625             v_int16 ires0 = v_pack(isum0, isum1),
 626                     ires1 = v_pack(isum2, isum3);
 627
 628             v_uint8 res = v_pack_u(ires0, ires1);
 629             v_store(reinterpret_cast<uchar*>(&out[l]), res);
 630         }
 631
 632         // tail (if any)
 633         if (l < length)
 634         {
 635             GAPI_DbgAssert(length >= nlanes);
 636             l = length - nlanes;
 637         }
 638     }
 639 }
 640
 641 // this code manually vectored for int16 not much faster than generic any-to-short code above
 642 #define USE_SEPFILTER3X3_CHAR2SHORT 1
 643
 644 #if USE_SEPFILTER3X3_CHAR2SHORT
 645 template<bool noscale>
 646 static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
 647                                         const float kx[], const float ky[], int border,
 648                                         float scale, float delta,
 649                                         float *buf[], int y, int y0)
 650 {
 651     const schar ikx0 = saturate<schar>(kx[0], rintf);
 652     const schar ikx1 = saturate<schar>(kx[1], rintf);
 653     const schar ikx2 = saturate<schar>(kx[2], rintf);
 654
 655     const schar iky0 = saturate<schar>(ky[0], rintf);
 656     const schar iky1 = saturate<schar>(ky[1], rintf);
 657     const schar iky2 = saturate<schar>(ky[2], rintf);
 658
 659     const short iscale = saturate<short>(scale * (1 << 15), rintf);
 660     const short idelta = saturate<short>(delta            , rintf);
 661
 662     // check if this code is applicable
 663     if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
 664         iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
 665         idelta != delta ||
 666         std::abs(scale) > 1 || std::abs(scale) < 0.01)
 667     {
 668         run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
 669                                             buf, y, y0);
 670         return;
 671     }
 672
 673     short *ibuf[3];
 674     ibuf[0] = reinterpret_cast<short*>(buf[0]);
 675     ibuf[1] = reinterpret_cast<short*>(buf[1]);
 676     ibuf[2] = reinterpret_cast<short*>(buf[2]);
 677
 678     int r[3];
 679     r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
 680     r[1] = (y - y0 + 1) % 3;  //            this
 681     r[2] = (y - y0 + 2) % 3;  //            next row
 682
 683     const int length = width * chan;
 684     const int shift = border * chan;
 685
 686     // horizontal pass
 687
 688     int k0 = (y == y0)? 0: 2;
 689
 690     for (int k = k0; k < 3; k++)
 691     {
 692         for (int l=0; l < length;)
 693         {
 694             constexpr int nlanes = v_int16::nlanes;
 695
 696             // main part of output row
 697             for (; l <= length - nlanes; l += nlanes)
 698             {
 699                 v_uint16 t0 = vx_load_expand(&in[k][l - shift]);  // previous
 700                 v_uint16 t1 = vx_load_expand(&in[k][l        ]);  // current
 701                 v_uint16 t2 = vx_load_expand(&in[k][l + shift]);  // next pixel
 702                 v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
 703                             v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
 704                             v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
 705                 v_store(&ibuf[r[k]][l], t);
 706             }
 707
 708             // tail (if any)
 709             if (l < length)
 710             {
 711                 GAPI_DbgAssert(length >= nlanes);
 712                 l = length - nlanes;
 713             }
 714         }
 715     }
 716
 717     // vertical pass
 718
 719     for (int l=0; l < length;)
 720     {
 721         constexpr int nlanes = v_int16::nlanes;
 722
 723         // main part of output row
 724         for (; l <= length - nlanes; l += nlanes)
 725         {
 726             v_int16 s0 = vx_load(&ibuf[r[0]][l]);  // previous
 727             v_int16 s1 = vx_load(&ibuf[r[1]][l]);  // current
 728             v_int16 s2 = vx_load(&ibuf[r[2]][l]);  // next row
 729             v_int16 s = s0 * vx_setall_s16(iky0) +
 730                         s1 * vx_setall_s16(iky1) +
 731                         s2 * vx_setall_s16(iky2);
 732
 733             if (!noscale)
 734             {
 735                 s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
 736             }
 737
 738             v_store(&out[l], s);
 739         }
 740
 741         // tail (if any)
 742         if (l < length)
 743         {
 744             GAPI_DbgAssert(length >= nlanes);
 745             l = length - nlanes;
 746         }
 747     }
 748 }
 749 #endif
 750
 751 #endif  // CV_SIMD
 752
 753 template<bool noscale, typename DST, typename SRC>
 754 static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
 755                                        const float kx[], const float ky[], int border,
 756                                        float scale, float delta,
 757                                        float *buf[], int y, int y0)
 758 {
 759     int r[3];
 760     r[0] = (y - y0)     % 3;  // buf[r[0]]: previous
 761     r[1] = (y - y0 + 1) % 3;  //            this
 762     r[2] = (y - y0 + 2) % 3;  //            next row
 763
 764     int length = width * chan;
 765     int shift = border * chan;
 766
 767     // horizontal pass
 768
 769     // full horizontal pass is needed only if very 1st row in ROI;
 770     // for 2nd and further rows, it is enough to convolve only the
 771     // "next" row - as we can reuse buffers from previous calls to
 772     // this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
 773
 774     int k0 = (y == y0)? 0: 2;
 775
 776     for (int k = k0; k < 3; k++)
 777     {
 778         //                      previous , this , next pixel
 779         const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
 780
 781         // rely on compiler vectoring
 782         for (int l=0; l < length; l++)
 783         {
 784             buf[r[k]][l] = s[0][l]*kx[0] + s[1][l]*kx[1] + s[2][l]*kx[2];
 785         }
 786     }
 787
 788     // vertical pass
 789
 790     for (int l=0; l < length; l++)
 791     {
 792         float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];
 793
 794         if (!noscale)
 795         {
 796             sum = sum*scale + delta;
 797         }
 798
 799         out[l] = saturate<DST>(sum, rintf);
 800     }
 801 }
 802
 803 template<bool noscale, typename DST, typename SRC>
 804 static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
 805                                   const float kx[], const float ky[], int border,
 806                                   float scale, float delta,
 807                                   float *buf[], int y, int y0)
 808 {
 809 #if CV_SIMD
 810     int length = width * chan;
 811
 812     // length variable may be unused if types do not match at 'if' statements below
 813     (void) length;
 814
 815 #if USE_SEPFILTER3X3_CHAR2SHORT
 816     if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
 817         length >= v_int16::nlanes)
 818     {
 819         // only slightly faster than more generic any-to-short (see below)
 820         run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
 821                                              reinterpret_cast<const uchar**>(in),
 822                                              width, chan, kx, ky, border, scale, delta,
 823                                              buf, y, y0);
 824         return;
 825     }
 826 #endif
 827
 828     if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
 829         length >= v_float32::nlanes)
 830     {
 831         // appears 15% faster than reference any-to-float code (called below)
 832         run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
 833                                             width, chan, kx, ky, border, scale, delta);
 834         return;
 835     }
 836
 837     if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
 838     {
 839         // appears 10-40x faster than reference due to much faster rounding
 840         run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
 841                                             width, chan, kx, ky, border, scale, delta,
 842                                             buf, y, y0);
 843         return;
 844     }
 845
 846     if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
 847     {
 848         // appears 10-40x faster than reference due to much faster rounding
 849         run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
 850                                             width, chan, kx, ky, border, scale, delta,
 851                                             buf, y, y0);
 852         return;
 853     }
 854
 855     if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
 856     {
 857         // appears 10-40x faster than reference due to much faster rounding
 858         run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
 859                                            width, chan, kx, ky, border, scale, delta,
 860                                            buf, y, y0);
 861         return;
 862     }
 863 #endif  // CV_SIMD
 864
 865     // reference code is quite fast for any-to-float case,
 866     // but not for any-to-integral due to very slow rounding
 867     run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
 868                                         scale, delta, buf, y, y0);
 869 }
 870
 871 #define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                      \
 872 void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
 873                            const float kx[], const float ky[], int border,   \
 874                            float scale, float delta,                         \
 875                            float *buf[], int y, int y0)                      \
 876 {                                                                            \
 877     if (scale == 1 && delta == 0)                                            \
 878     {                                                                        \
 879         constexpr bool noscale = true;                                       \
 880         run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
 881                                        scale, delta, buf, y, y0);            \
 882     }                                                                        \
 883     else                                                                     \
 884     {                                                                        \
 885         constexpr bool noscale = false;                                      \
 886         run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
 887                                        scale, delta, buf, y, y0);            \
 888     }                                                                        \
 889 }
 890
 891 RUN_SEPFILTER3X3_IMPL(uchar , uchar )
 892 RUN_SEPFILTER3X3_IMPL( short, uchar )
 893 RUN_SEPFILTER3X3_IMPL( float, uchar )
 894 RUN_SEPFILTER3X3_IMPL(ushort, ushort)
 895 RUN_SEPFILTER3X3_IMPL( short, ushort)
 896 RUN_SEPFILTER3X3_IMPL( float, ushort)
 897 RUN_SEPFILTER3X3_IMPL( short,  short)
 898 RUN_SEPFILTER3X3_IMPL( float,  short)
 899 RUN_SEPFILTER3X3_IMPL( float,  float)
 900
 901 #undef RUN_SEPFILTER3X3_IMPL
 902
 903 //-------------------------
 904 //
 905 // Fluid kernels: Filter 2D
 906 //
 907 //-------------------------
 908
 909 template<bool noscale, typename DST, typename SRC>
 910 static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan,
 911                                        const float kernel[], float scale, float delta)
 912 {
 913     static constexpr int ksize = 3;
 914     static constexpr int border = (ksize - 1) / 2;
 915
 916     const int length = width * chan;
 917     const int shift = border * chan;
 918
 919     const float k[3][3] = {{ kernel[0], kernel[1], kernel[2] },
 920                            { kernel[3], kernel[4], kernel[5] },
 921                            { kernel[6], kernel[7], kernel[8] }};
 922
 923     for (int l=0; l < length; l++)
 924     {
 925         float sum = in[0][l - shift] * k[0][0] + in[0][l] * k[0][1] + in[0][l + shift] * k[0][2]
 926                   + in[1][l - shift] * k[1][0] + in[1][l] * k[1][1] + in[1][l + shift] * k[1][2]
 927                   + in[2][l - shift] * k[2][0] + in[2][l] * k[2][1] + in[2][l + shift] * k[2][2];
 928
 929         if (!noscale)
 930         {
 931             sum = sum*scale + delta;
 932         }
 933
 934         out[l] = saturate<DST>(sum, rintf);
 935     }
 936 }
 937
 938 #if CV_SIMD
 939 // assume DST is short or ushort
 940 template<bool noscale, typename DST, typename SRC>
 941 static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
 942                                        const float kernel[], float scale, float delta)
 943 {
 944     static constexpr int ksize = 3;
 945     static constexpr int border = (ksize - 1) / 2;
 946
 947     const int length = width * chan;
 948     const int shift = border * chan;
 949
 950     const float k[3][3] = {
 951         { kernel[0], kernel[1], kernel[2] },
 952         { kernel[3], kernel[4], kernel[5] },
 953         { kernel[6], kernel[7], kernel[8] }
 954     };
 955
 956     for (int l=0; l < length;)
 957     {
 958         static constexpr int nlanes = v_int16::nlanes;
 959
 960         // main part of output row
 961         for (; l <= length - nlanes; l += nlanes)
 962         {
 963             auto sumx = [in, shift, &k](int i, int j)
 964             {
 965                 v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
 966                     s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
 967                     s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
 968                 return s;
 969             };
 970
 971             int l0 = l;
 972             int l1 = l + nlanes/2;
 973             v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
 974             v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
 975
 976             if (!noscale)
 977             {
 978                 sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
 979                 sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
 980             }
 981
 982             v_int32 res0 = v_round(sum0);
 983             v_int32 res1 = v_round(sum1);
 984
 985             if (std::is_same<DST, ushort>::value)
 986             {
 987                 v_uint16 res = v_pack_u(res0, res1);
 988                 v_store(reinterpret_cast<ushort*>(&out[l]), res);
 989             }
 990             else // if DST == short
 991             {
 992                 v_int16 res = v_pack(res0, res1);
 993                 v_store(reinterpret_cast<short*>(&out[l]), res);
 994             }
 995         }
 996
 997         // tail (if any)
 998         if (l < length)
 999         {
1000             GAPI_DbgAssert(length >= nlanes);
1001             l = length - nlanes;
1002         }
1003     }
1004 }
1005
1006 template<bool noscale, typename SRC>
1007 static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
1008                                       const float kernel[], float scale, float delta)
1009 {
1010     static constexpr int ksize = 3;
1011     static constexpr int border = (ksize - 1) / 2;
1012
1013     const int length = width * chan;
1014     const int shift = border * chan;
1015
1016     const float k[3][3] = {
1017         { kernel[0], kernel[1], kernel[2] },
1018         { kernel[3], kernel[4], kernel[5] },
1019         { kernel[6], kernel[7], kernel[8] }
1020     };
1021
1022     for (int l=0; l < length;)
1023     {
1024         static constexpr int nlanes = v_uint8::nlanes;
1025
1026         // main part of output row
1027         for (; l <= length - nlanes; l += nlanes)
1028         {
1029             auto sumx = [in, shift, &k](int i, int j)
1030             {
1031                 v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
1032                     s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
1033                     s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
1034                 return s;
1035             };
1036
1037             int l0 = l;
1038             int l1 = l +   nlanes/4;
1039             int l2 = l + 2*nlanes/4;
1040             int l3 = l + 3*nlanes/4;
1041             v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
1042             v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
1043             v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2);
1044             v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3);
1045
1046             if (!noscale)
1047             {
1048                 sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
1049                 sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
1050                 sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
1051                 sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
1052             }
1053
1054             v_int32 res0 = v_round(sum0);
1055             v_int32 res1 = v_round(sum1);
1056             v_int32 res2 = v_round(sum2);
1057             v_int32 res3 = v_round(sum3);
1058
1059             v_int16 resl = v_pack(res0, res1);
1060             v_int16 resh = v_pack(res2, res3);
1061             v_uint8 res = v_pack_u(resl, resh);
1062
1063             v_store(&out[l], res);
1064         }
1065
1066         // tail (if any)
1067         if (l < length)
1068         {
1069             GAPI_DbgAssert(length >= nlanes);
1070             l = length - nlanes;
1071         }
1072     }
1073 }
1074 #endif
1075
1076 template<bool noscale, typename DST, typename SRC>
1077 static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
1078                                   const float kernel[], float scale, float delta)
1079 {
1080 #if CV_SIMD
1081     int length = width * chan;
1082
1083     // length variable may be unused if types do not match at 'if' statements below
1084     (void) length;
1085
1086     if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
1087     {
1088         run_filter2d_3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
1089                                             width, chan, kernel, scale, delta);
1090         return;
1091     }
1092
1093     if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
1094     {
1095         run_filter2d_3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
1096                                             width, chan, kernel, scale, delta);
1097         return;
1098     }
1099
1100
1101     if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
1102     {
1103         run_filter2d_3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
1104                                            width, chan, kernel, scale, delta);
1105         return;
1106     }
1107 #endif  // CV_SIMD
1108
1109     run_filter2d_3x3_reference<noscale>(out, in, width, chan, kernel, scale, delta);
1110 }
1111
1112 #define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                             \
1113 void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan,         \
1114                            const float kernel[], float scale, float delta)          \
1115 {                                                                                   \
1116     if (scale == 1 && delta == 0)                                                   \
1117     {                                                                               \
1118         constexpr bool noscale = true;                                              \
1119         run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
1120     }                                                                               \
1121     else                                                                            \
1122     {                                                                               \
1123         constexpr bool noscale = false;                                             \
1124         run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
1125     }                                                                               \
1126 }
1127
1128 RUN_FILTER2D_3X3_IMPL(uchar , uchar )
1129 RUN_FILTER2D_3X3_IMPL(ushort, ushort)
1130 RUN_FILTER2D_3X3_IMPL( short,  short)
1131 RUN_FILTER2D_3X3_IMPL( float, uchar )
1132 RUN_FILTER2D_3X3_IMPL( float, ushort)
1133 RUN_FILTER2D_3X3_IMPL( float,  short)
1134 RUN_FILTER2D_3X3_IMPL( float,  float)
1135
1136 #undef RUN_FILTER2D_3X3_IMPL
1137
1138 //-----------------------------
1139 //
1140 // Fluid kernels: Erode, Dilate
1141 //
1142 //-----------------------------
1143
1144 template<typename T>
1145 static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan,
1146                                         const uchar k[], MorphShape k_type,
1147                                         Morphology morphology)
1148 {
1149     constexpr int k_size = 3;
1150     constexpr int border = (k_size - 1) / 2;
1151
1152     const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};
1153
1154     const int length = width * chan;
1155     const int shift = border * chan;
1156
1157     if (M_ERODE == morphology)
1158     {
1159         if (M_FULL == k_type)
1160         {
1161             for (int l=0; l < length; l++)
1162             {
1163                 T result = std::numeric_limits<T>::max();
1164
1165                 result = (std::min)(result, in[0][l - shift]);
1166                 result = (std::min)(result, in[0][l        ]);
1167                 result = (std::min)(result, in[0][l + shift]);
1168
1169                 result = (std::min)(result, in[1][l - shift]);
1170                 result = (std::min)(result, in[1][l        ]);
1171                 result = (std::min)(result, in[1][l + shift]);
1172
1173                 result = (std::min)(result, in[2][l - shift]);
1174                 result = (std::min)(result, in[2][l        ]);
1175                 result = (std::min)(result, in[2][l + shift]);
1176
1177                 out[l] = result;
1178             }
1179             return;
1180         }
1181
1182         if (M_CROSS == k_type)
1183         {
1184             for (int l=0; l < length; l++)
1185             {
1186                 T result = std::numeric_limits<T>::max();
1187
1188             //  result = (std::min)(result, in[0][l - shift]);
1189                 result = (std::min)(result, in[0][l        ]);
1190             //  result = (std::min)(result, in[0][l + shift]);
1191
1192                 result = (std::min)(result, in[1][l - shift]);
1193                 result = (std::min)(result, in[1][l        ]);
1194                 result = (std::min)(result, in[1][l + shift]);
1195
1196             //  result = (std::min)(result, in[2][l - shift]);
1197                 result = (std::min)(result, in[2][l        ]);
1198             //  result = (std::min)(result, in[2][l + shift]);
1199
1200                 out[l] = result;
1201             }
1202             return;
1203         }
1204
1205         for (int l=0; l < length; l++)
1206         {
1207             T result = std::numeric_limits<T>::max();
1208
1209             result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result;
1210             result = kernel[0][1]? (std::min)(result, in[0][l        ]): result;
1211             result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result;
1212
1213             result = kernel[1][0]? (std::min)(result, in[1][l - shift]): result;
1214             result = kernel[1][1]? (std::min)(result, in[1][l        ]): result;
1215             result = kernel[1][2]? (std::min)(result, in[1][l + shift]): result;
1216
1217             result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result;
1218             result = kernel[2][1]? (std::min)(result, in[2][l        ]): result;
1219             result = kernel[2][2]? (std::min)(result, in[2][l + shift]): result;
1220
1221             out[l] = result;
1222         }
1223         return;
1224     }
1225
1226     if (M_DILATE == morphology)
1227     {
1228         if (M_FULL == k_type)
1229         {
1230             for (int l=0; l < length; l++)
1231             {
1232                 T result = std::numeric_limits<T>::min();
1233
1234                 result = (std::max)(result, in[0][l - shift]);
1235                 result = (std::max)(result, in[0][l        ]);
1236                 result = (std::max)(result, in[0][l + shift]);
1237
1238                 result = (std::max)(result, in[1][l - shift]);
1239                 result = (std::max)(result, in[1][l        ]);
1240                 result = (std::max)(result, in[1][l + shift]);
1241
1242                 result = (std::max)(result, in[2][l - shift]);
1243                 result = (std::max)(result, in[2][l        ]);
1244                 result = (std::max)(result, in[2][l + shift]);
1245
1246                 out[l] = result;
1247             }
1248             return;
1249         }
1250
1251         if (M_CROSS == k_type)
1252         {
1253             for (int l=0; l < length; l++)
1254             {
1255                 T result = std::numeric_limits<T>::min();
1256
1257             //  result = (std::max)(result, in[0][l - shift]);
1258                 result = (std::max)(result, in[0][l        ]);
1259             //  result = (std::max)(result, in[0][l + shift]);
1260
1261                 result = (std::max)(result, in[1][l - shift]);
1262                 result = (std::max)(result, in[1][l        ]);
1263                 result = (std::max)(result, in[1][l + shift]);
1264
1265             //  result = (std::max)(result, in[2][l - shift]);
1266                 result = (std::max)(result, in[2][l        ]);
1267             //  result = (std::max)(result, in[2][l + shift]);
1268
1269                 out[l] = result;
1270             }
1271             return;
1272         }
1273
1274         for (int l=0; l < length; l++)
1275         {
1276             T result = std::numeric_limits<T>::min();
1277
1278             result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result;
1279             result = kernel[0][1]? (std::max)(result, in[0][l        ]): result;
1280             result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result;
1281
1282             result = kernel[1][0]? (std::max)(result, in[1][l - shift]): result;
1283             result = kernel[1][1]? (std::max)(result, in[1][l        ]): result;
1284             result = kernel[1][2]? (std::max)(result, in[1][l + shift]): result;
1285
1286             result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result;
1287             result = kernel[2][1]? (std::max)(result, in[2][l        ]): result;
1288             result = kernel[2][2]? (std::max)(result, in[2][l + shift]): result;
1289
1290             out[l] = result;
1291         }
1292         return;
1293     }
1294
1295     CV_Error(cv::Error::StsBadArg, "unsupported morphology");
1296 }
1297
1298 #if CV_SIMD
1299 template<typename T, typename VT, typename S>
1300 static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
1301                                    const uchar k[], MorphShape k_type,
1302                                    Morphology morphology,
1303                                    S setall)
1304 {
1305     constexpr int k_size = 3;
1306     constexpr int border = (k_size - 1) / 2;
1307
1308     const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};
1309
1310     const int length = width * chan;
1311     const int shift = border * chan;
1312
1313     if (M_ERODE == morphology)
1314     {
1315         if (M_FULL == k_type)
1316         {
1317             for (int l=0; l < length;)
1318             {
1319                 constexpr int nlanes = VT::nlanes;
1320
1321                 // main part of output row
1322                 for (; l <= length - nlanes; l += nlanes)
1323                 {
1324                     VT r = setall(std::numeric_limits<T>::max());
1325
1326                     r = v_min(r, vx_load(&in[0][l - shift]));
1327                     r = v_min(r, vx_load(&in[0][l        ]));
1328                     r = v_min(r, vx_load(&in[0][l + shift]));
1329
1330                     r = v_min(r, vx_load(&in[1][l - shift]));
1331                     r = v_min(r, vx_load(&in[1][l        ]));
1332                     r = v_min(r, vx_load(&in[1][l + shift]));
1333
1334                     r = v_min(r, vx_load(&in[2][l - shift]));
1335                     r = v_min(r, vx_load(&in[2][l        ]));
1336                     r = v_min(r, vx_load(&in[2][l + shift]));
1337
1338                     v_store(&out[l], r);
1339                 }
1340
1341                 // tail (if any)
1342                 if (l < length)
1343                 {
1344                     GAPI_DbgAssert(length >= nlanes);
1345                     l = length - nlanes;
1346                 }
1347             }
1348             return;
1349         }
1350
1351         if (M_CROSS == k_type)
1352         {
1353             for (int l=0; l < length;)
1354             {
1355                 constexpr int nlanes = VT::nlanes;
1356
1357                 // main part of output row
1358                 for (; l <= length - nlanes; l += nlanes)
1359                 {
1360                     VT r = setall(std::numeric_limits<T>::max());
1361
1362                 //  r = v_min(r, vx_load(&in[0][l - shift]));
1363                     r = v_min(r, vx_load(&in[0][l        ]));
1364                 //  r = v_min(r, vx_load(&in[0][l + shift]));
1365
1366                     r = v_min(r, vx_load(&in[1][l - shift]));
1367                     r = v_min(r, vx_load(&in[1][l        ]));
1368                     r = v_min(r, vx_load(&in[1][l + shift]));
1369
1370                 //  r = v_min(r, vx_load(&in[2][l - shift]));
1371                     r = v_min(r, vx_load(&in[2][l        ]));
1372                 //  r = v_min(r, vx_load(&in[2][l + shift]));
1373
1374                     v_store(&out[l], r);
1375                 }
1376
1377                 // tail (if any)
1378                 if (l < length)
1379                 {
1380                     GAPI_DbgAssert(length >= nlanes);
1381                     l = length - nlanes;
1382                 }
1383             }
1384             return;
1385         }
1386
1387         for (int l=0; l < length;)
1388         {
1389             constexpr int nlanes = VT::nlanes;
1390
1391             // main part of output row
1392             for (; l <= length - nlanes; l += nlanes)
1393             {
1394                 VT r = setall(std::numeric_limits<T>::max());
1395
1396                 if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift]));
1397                 if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l        ]));
1398                 if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift]));
1399
1400                 if (kernel[1][0]) r = v_min(r, vx_load(&in[1][l - shift]));
1401                 if (kernel[1][1]) r = v_min(r, vx_load(&in[1][l        ]));
1402                 if (kernel[1][2]) r = v_min(r, vx_load(&in[1][l + shift]));
1403
1404                 if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift]));
1405                 if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l        ]));
1406                 if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift]));
1407
1408                 v_store(&out[l], r);
1409             }
1410
1411             // tail (if any)
1412             if (l < length)
1413             {
1414                 GAPI_DbgAssert(length >= nlanes);
1415                 l = length - nlanes;
1416             }
1417         }
1418         return;
1419     }
1420
1421     if (M_DILATE == morphology)
1422     {
1423         if (M_FULL == k_type)
1424         {
1425             for (int l=0; l < length;)
1426             {
1427                 constexpr int nlanes = VT::nlanes;
1428
1429                 // main part of output row
1430                 for (; l <= length - nlanes; l += nlanes)
1431                 {
1432                     VT r = setall(std::numeric_limits<T>::min());
1433
1434                     r = v_max(r, vx_load(&in[0][l - shift]));
1435                     r = v_max(r, vx_load(&in[0][l        ]));
1436                     r = v_max(r, vx_load(&in[0][l + shift]));
1437
1438                     r = v_max(r, vx_load(&in[1][l - shift]));
1439                     r = v_max(r, vx_load(&in[1][l        ]));
1440                     r = v_max(r, vx_load(&in[1][l + shift]));
1441
1442                     r = v_max(r, vx_load(&in[2][l - shift]));
1443                     r = v_max(r, vx_load(&in[2][l        ]));
1444                     r = v_max(r, vx_load(&in[2][l + shift]));
1445
1446                     v_store(&out[l], r);
1447                 }
1448
1449                 // tail (if any)
1450                 if (l < length)
1451                 {
1452                     GAPI_DbgAssert(length >= nlanes);
1453                     l = length - nlanes;
1454                 }
1455             }
1456             return;
1457         }
1458
1459         if (M_CROSS == k_type)
1460         {
1461             for (int l=0; l < length;)
1462             {
1463                 constexpr int nlanes = VT::nlanes;
1464
1465                 // main part of output row
1466                 for (; l <= length - nlanes; l += nlanes)
1467                 {
1468                     VT r = setall(std::numeric_limits<T>::min());
1469
1470                 //  r = v_max(r, vx_load(&in[0][l - shift]));
1471                     r = v_max(r, vx_load(&in[0][l        ]));
1472                 //  r = v_max(r, vx_load(&in[0][l + shift]));
1473
1474                     r = v_max(r, vx_load(&in[1][l - shift]));
1475                     r = v_max(r, vx_load(&in[1][l        ]));
1476                     r = v_max(r, vx_load(&in[1][l + shift]));
1477
1478                 //  r = v_max(r, vx_load(&in[2][l - shift]));
1479                     r = v_max(r, vx_load(&in[2][l        ]));
1480                 //  r = v_max(r, vx_load(&in[2][l + shift]));
1481
1482                     v_store(&out[l], r);
1483                 }
1484
1485                 // tail (if any)
1486                 if (l < length)
1487                 {
1488                     GAPI_DbgAssert(length >= nlanes);
1489                     l = length - nlanes;
1490                 }
1491             }
1492             return;
1493         }
1494
1495         for (int l=0; l < length;)
1496         {
1497             constexpr int nlanes = VT::nlanes;
1498
1499             // main part of output row
1500             for (; l <= length - nlanes; l += nlanes)
1501             {
1502                 VT r = setall(std::numeric_limits<T>::min());
1503
1504                 if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift]));
1505                 if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l        ]));
1506                 if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift]));
1507
1508                 if (kernel[1][0]) r = v_max(r, vx_load(&in[1][l - shift]));
1509                 if (kernel[1][1]) r = v_max(r, vx_load(&in[1][l        ]));
1510                 if (kernel[1][2]) r = v_max(r, vx_load(&in[1][l + shift]));
1511
1512                 if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift]));
1513                 if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l        ]));
1514                 if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift]));
1515
1516                 v_store(&out[l], r);
1517             }
1518
1519             // tail (if any)
1520             if (l < length)
1521             {
1522                 GAPI_DbgAssert(length >= nlanes);
1523                 l = length - nlanes;
1524             }
1525         }
1526         return;
1527     }
1528
1529     CV_Error(cv::Error::StsBadArg, "unsupported morphology");
1530 }
1531 #endif
1532
1533 template<typename T>
1534 static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
1535                                    const uchar k[], MorphShape k_type,
1536                                    Morphology morphology)
1537 {
1538 #if CV_SIMD
1539     int length = width * chan;
1540
1541     // length variable may be unused if types do not match at 'if' statements below
1542     (void) length;
1543
1544     if (std::is_same<T, float>::value && length >= v_float32::nlanes)
1545     {
1546         run_morphology3x3_simd<float, v_float32>(reinterpret_cast<float*>(out),
1547                                                  reinterpret_cast<const float**>(in),
1548                                                  width, chan, k, k_type, morphology,
1549                                                  vx_setall_f32);
1550         return;
1551     }
1552
1553     if (std::is_same<T, short>::value && length >= v_int16::nlanes)
1554     {
1555         run_morphology3x3_simd<short, v_int16>(reinterpret_cast<short*>(out),
1556                                                reinterpret_cast<const short**>(in),
1557                                                width, chan, k, k_type, morphology,
1558                                                vx_setall_s16);
1559         return;
1560     }
1561
1562     if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
1563     {
1564         run_morphology3x3_simd<ushort, v_uint16>(reinterpret_cast<ushort*>(out),
1565                                                  reinterpret_cast<const ushort**>(in),
1566                                                  width, chan, k, k_type, morphology,
1567                                                  vx_setall_u16);
1568         return;
1569     }
1570
1571     if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
1572     {
1573         run_morphology3x3_simd<uchar, v_uint8>(reinterpret_cast<uchar*>(out),
1574                                                reinterpret_cast<const uchar**>(in),
1575                                                width, chan, k, k_type, morphology,
1576                                                vx_setall_u8);
1577         return;
1578     }
1579 #endif  // CV_SIMD
1580
1581     run_morphology3x3_reference(out, in, width, chan, k, k_type, morphology);
1582 }
1583
1584 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
1585 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
1586                             const uchar k[], MorphShape k_type,          \
1587                             Morphology morphology)                       \
1588 {                                                                        \
1589     run_morphology3x3_code(out, in, width, chan, k, k_type, morphology); \
1590 }
1591
1592 RUN_MORPHOLOGY3X3_IMPL(uchar )
1593 RUN_MORPHOLOGY3X3_IMPL(ushort)
1594 RUN_MORPHOLOGY3X3_IMPL( short)
1595 RUN_MORPHOLOGY3X3_IMPL( float)
1596
1597 #undef RUN_MORPHOLOGY3X3_IMPL
1598
1599 //---------------------------
1600 //
1601 // Fluid kernels: Median blur
1602 //
1603 //---------------------------
1604
1605 template<typename T>
1606 static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
1607 {
1608     constexpr int ksize = 3;
1609     constexpr int border = (ksize - 1) / 2;
1610
1611     const int length = width * chan;
1612     const int shift = border * chan;
1613
1614     for (int l=0; l < length; l++)
1615     {
1616         T t[3][3];
1617
1618         // neighbourhood 3x3
1619         t[0][0] = in[0][l - shift];    t[0][1] = in[0][l];    t[0][2] = in[0][l + shift];
1620         t[1][0] = in[1][l - shift];    t[1][1] = in[1][l];    t[1][2] = in[1][l + shift];
1621         t[2][0] = in[2][l - shift];    t[2][1] = in[2][l];    t[2][2] = in[2][l + shift];
1622
1623         // sort 2 values
1624         auto sort = [](T& a, T& b)
1625         {
1626             T u=a, v=b;
1627             a = (std::min)(u, v);
1628             b = (std::max)(u, v);
1629         };
1630
1631         // horizontal: 3-elements bubble-sort per each row
1632         sort(t[0][0], t[0][1]);    sort(t[0][1], t[0][2]);    sort(t[0][0], t[0][1]);
1633         sort(t[1][0], t[1][1]);    sort(t[1][1], t[1][2]);    sort(t[1][0], t[1][1]);
1634         sort(t[2][0], t[2][1]);    sort(t[2][1], t[2][2]);    sort(t[2][0], t[2][1]);
1635
1636         // vertical: columns bubble-sort (although partial)
1637         sort(t[0][0], t[1][0]);    sort(t[0][1], t[1][1]);  /*sort(t[0][2], t[1][2]);*/
1638         sort(t[1][0], t[2][0]);    sort(t[1][1], t[2][1]);    sort(t[1][2], t[2][2]);
1639       /*sort(t[0][0], t[1][0]);*/  sort(t[0][1], t[1][1]);    sort(t[0][2], t[1][2]);
1640
1641         // diagonal: bubble-sort (in opposite order!)
1642         sort(t[1][1], t[0][2]);    sort(t[2][0], t[1][1]);    sort(t[1][1], t[0][2]);
1643
1644         out[l] = t[1][1];
1645     }
1646 }
1647
1648 #if CV_SIMD
1649 template<typename VT, typename T>
1650 static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
1651 {
1652     constexpr int ksize = 3;
1653     constexpr int border = (ksize - 1) / 2;
1654
1655     const int length = width * chan;
1656     const int shift = border * chan;
1657
1658     for (int l=0; l < length;)
1659     {
1660         constexpr int nlanes = VT::nlanes;
1661
1662         // main part of output row
1663         for (; l <= length - nlanes; l += nlanes)
1664         {
1665             VT t00, t01, t02, t10, t11, t12, t20, t21, t22;
1666
1667             // neighbourhood 3x3
1668
1669             t00 = vx_load(&in[0][l - shift]);
1670             t01 = vx_load(&in[0][l        ]);
1671             t02 = vx_load(&in[0][l + shift]);
1672
1673             t10 = vx_load(&in[1][l - shift]);
1674             t11 = vx_load(&in[1][l        ]);
1675             t12 = vx_load(&in[1][l + shift]);
1676
1677             t20 = vx_load(&in[2][l - shift]);
1678             t21 = vx_load(&in[2][l        ]);
1679             t22 = vx_load(&in[2][l + shift]);
1680
1681             // sort 2 values
1682             auto sort = [](VT& a, VT& b)
1683             {
1684                 VT u=a, v=b;
1685                 a = v_min(u, v);
1686                 b = v_max(u, v);
1687             };
1688
1689             // horizontal: 3-elements bubble-sort per each row
1690             sort(t00, t01);    sort(t01, t02);    sort(t00, t01);
1691             sort(t10, t11);    sort(t11, t12);    sort(t10, t11);
1692             sort(t20, t21);    sort(t21, t22);    sort(t20, t21);
1693
1694             // vertical: columns bubble-sort (although partial)
1695             sort(t00, t10);    sort(t01, t11);  /*sort(t02, t12);*/
1696             sort(t10, t20);    sort(t11, t21);    sort(t12, t22);
1697           /*sort(t00, t10);*/  sort(t01, t11);    sort(t02, t12);
1698
1699             // diagonal: bubble-sort (in opposite order!)
1700             sort(t11, t02);    sort(t20, t11);    sort(t11, t02);
1701
1702             v_store(&out[l], t11);
1703         }
1704
1705         // tail (if any)
1706         if (l < length)
1707         {
1708             GAPI_DbgAssert(length >= nlanes);
1709             l = length - nlanes;
1710         }
1711     }
1712 }
1713 #endif
1714
1715 template<typename T>
1716 static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
1717 {
1718 #if CV_SIMD
1719     int length = width * chan;
1720
1721     // length variable may be unused if types do not match at 'if' statements below
1722     (void) length;
1723
1724     if (std::is_same<T, float>::value && length >= v_float32::nlanes)
1725     {
1726         run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
1727                                        reinterpret_cast<const float**>(in),
1728                                        width, chan);
1729         return;
1730     }
1731
1732     if (std::is_same<T, short>::value && length >= v_int16::nlanes)
1733     {
1734         run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
1735                                      reinterpret_cast<const short**>(in),
1736                                      width, chan);
1737         return;
1738     }
1739
1740     if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
1741     {
1742         run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
1743                                       reinterpret_cast<const ushort**>(in),
1744                                       width, chan);
1745         return;
1746     }
1747
1748     if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
1749     {
1750         run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
1751                                      reinterpret_cast<const uchar**>(in),
1752                                      width, chan);
1753         return;
1754     }
1755 #endif
1756
1757     run_medblur3x3_reference(out, in, width, chan);
1758 }
1759
1760 #define RUN_MEDBLUR3X3_IMPL(T)                                        \
1761 void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
1762 {                                                                     \
1763     run_medblur3x3_code(out, in, width, chan);                        \
1764 }
1765
1766 RUN_MEDBLUR3X3_IMPL(uchar )
1767 RUN_MEDBLUR3X3_IMPL(ushort)
1768 RUN_MEDBLUR3X3_IMPL( short)
1769 RUN_MEDBLUR3X3_IMPL( float)
1770
1771 #undef RUN_MEDBLUR3X3_IMPL
1772
1773 //------------------------------------------------------------------------------
1774
1775 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
1776
1777 CV_CPU_OPTIMIZATION_NAMESPACE_END
1778
1779 }  // namespace fluid
1780 }  // namespace gapi
1781 }  // namespace cv
1782
1783 #endif // !defined(GAPI_STANDALONE)