From: Anna Khakimova Date: Tue, 7 Jul 2020 08:38:59 +0000 (+0300) Subject: Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=987cc5ee528a04b92d24aa8ab88f6a028d017af8;p=platform%2Fupstream%2Fdldt.git Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize. (#942) * Preprocessing(GAPI): Universal intrinsics (AVX2) implementation of U8C1 linear Resize * Refactoring --- diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp index 71c23ce..d790517 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp @@ -131,6 +131,314 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf); } +static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0, + const v_uint8& val_1, + const v_uint8& val_2, + const v_uint8& val_3, + const v_int16& a10, + const v_int16& a32, + const v_int16& a54, + const v_int16& a76, + v_uint8& shuf_mask1, + v_uint8& shuf_mask2, + v_uint8& res1, v_uint8& res2) { + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1); + v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1); + + v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); + v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); + + v_uint8 q6 = v256_permute4x64<0xD8>(q4); + v_uint8 q7 = v256_permute4x64<0xD8>(q5); + + res1 = v_shuffle_s8(q6, shuf_mask2); + res2 = v_shuffle_s8(q7, shuf_mask2); +} + +static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[], + uint8_t tmp[], const short beta[], + const int& length, const int& half_nlanes) { + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15); + for (int w = 0; w < length; ) { + for (; w <= length - half_nlanes; w += half_nlanes) { + v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); + v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); + v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); + v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); + + v_int16 val1_0 
= v_load_ccache_expand(&src1[0][w]); + v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); + v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); + v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); + v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); + + v_uint8 q4 = v_shuffle_s8(q2, shuf_mask); + v_uint8 q5 = v_shuffle_s8(q3, shuf_mask); + + v_uint8 q6 = v256_permute2x128<0x20>(q4, q5); + v_uint8 q7 = v256_permute2x128<0x31>(q4, q5); + + vx_store(&tmp[4 * w + 0], q6); + vx_store(&tmp[4 * w + 2 * half_nlanes], q7); + } + + if (w < length) { + w = length - half_nlanes; + } + } +} + +static inline void insert64(v_uint8& val, const short mapsx[], + uint8_t tmp[], const int& x, const int& shift) { + val = v_insert64<0>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 0]])); + val = v_insert64<1>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 1]])); + val = v_insert64<2>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 2]])); + val = v_insert64<3>(val, *reinterpret_cast(&tmp[4 * mapsx[x + shift + 3]])); +} + +static inline v_uint8 setHorizontalShufMask1() { + return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15, + 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15); +} + +static inline v_uint8 setHorizontalShufMask2() { + return v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15); +} + +static inline void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[], + uint8_t tmp[], uint8_t* dst[], const int& length, + const int& half_nlanes) { + v_uint8 val_0, val_1, val_2, val_3, res1, res2; + constexpr int shift = 4; + v_uint8 shuf_mask1 = setHorizontalShufMask1(); + v_uint8 shuf_mask2 = setHorizontalShufMask2();; + v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7); + + for (int x = 0; x < length; ) { + for (; x <= length - half_nlanes; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 4)]); + v_int16 a54 = vx_load(&clone[4 * (x + 8)]); + v_int16 a76 = vx_load(&clone[4 * (x + 12)]); + + insert64(val_0, mapsx, tmp, x, 0); + insert64(val_1, mapsx, tmp, x, shift); + insert64(val_2, mapsx, tmp, x, shift*2); + insert64(val_3, mapsx, tmp, x, shift*3); + + val_0 = v_permutevar8x32(val_0, idxs); + val_1 = v_permutevar8x32(val_1, idxs); + val_2 = v_permutevar8x32(val_2, idxs); + val_3 = v_permutevar8x32(val_3, idxs); + + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + res1, res2); + + v_store_low(&dst[0][x], res1); + v_store_high(&dst[1][x], res1); + v_store_low(&dst[2][x], res2); + v_store_high(&dst[3][x], res2); + } + + if (x < length) { + x = length - half_nlanes; + } + } +} + +static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[], + uint8_t tmp[], const int& beta0, const int& half_nlanes, + const int& l, const int& length1, const int& length2) { + for (int w = 0; w < length2; ) { + for (; w <= length1 - half_nlanes; 
w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < length1) { + w = length1 - half_nlanes; + } + } +} + +static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[], + uint8_t* dst[], const uchar tmp[], const int& l, + const int& half_nlanes, const int& length) { + for (int x = 0; x < length; ) { + for (; x <= length - half_nlanes; x += half_nlanes) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_uint8 t = v_gather_pairs(tmp, sx); // 8 pairs of src0 pixels + v_int16 t0, t1; + v_deinterleave_expand(t, t0, t1); // tmp pixels as int16 + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[l][x], d); + } + + if (x < length) { + x = length - half_nlanes; + } + } +} + +// 8UC1 Resize (bi-linear) +void calcRowLinear_8UC1(uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { + bool xRatioEq = inSz.width == outSz.width; + bool yRatioEq = inSz.height == outSz.height; + + constexpr int nlanes = v_uint8::nlanes; + constexpr int half_nlanes = (nlanes / 2); + + if (!xRatioEq && !yRatioEq) { + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width, half_nlanes); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes); + + } else { // if any lpi + int inLength = inSz.width; + int outLength = outSz.width; + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength); + } + } // if lpi == 4 + + } else if (!xRatioEq) { + GAPI_DbgAssert(yRatioEq); + + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= nlanes); + for (int w = 0; w < inSz.width; ) { + for (; w <= inSz.width - nlanes; w += nlanes) { + v_uint8 s0, s1, s2, s3; + s0 = vx_load(&src0[0][w]); + s1 = vx_load(&src0[1][w]); + s2 = vx_load(&src0[2][w]); + s3 = vx_load(&src0[3][w]); + v_store_interleave(&tmp[4 * w], s0, s1, s2, s3); + } + + if (w < inSz.width) { + w = inSz.width - nlanes; + } + } + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes); + + } else { // any LPI + for (int l = 0; l < lpi; ++l) { + const uchar *src = src0[l]; + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width); + } + } + + } else if (!yRatioEq) { + GAPI_DbgAssert(xRatioEq); + int inLength = inSz.width; + int outLength = outSz.width; + + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l, + inLength, outLength); + } + + } else { + GAPI_DbgAssert(xRatioEq && yRatioEq); + int length = inSz.width; + + for 
(int l = 0; l < lpi; ++l) { + memcpy(dst[l], src0[l], length); + } + } +} + template void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, const uint8_t *src0[], @@ -147,75 +455,18 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, const int shift = (half_nlanes / 4); if (4 == lpi) { - GAPI_DbgAssert(inSz.width >= half_nlanes); - - v_uint8 shuf_mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15, - 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15); - - v_uint8 shuf_mask2 = v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15, - 0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15); - - v_uint8 shuf_mask3 = v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11, - 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, - 4, 5, 12, 13, 6, 7, 14, 15); - - // vertical pass - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { - v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); - v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); - v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); - v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); - - v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); - v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); - v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); - v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); - - v_uint8 q2 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q0, q1); - v_uint8 q3 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q0, q1); - - v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1); - v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1); - - v_uint8 q6 = v256_permute2x128<0x20>(q4, q5); - v_uint8 q7 = v256_permute2x128<0x31>(q4, q5); - - vx_store(&tmp[4 * w + 0], q6); - vx_store(&tmp[4 * w + 2 * half_nlanes], q7); - } - - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; - } - } + GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); + verticalPass_lpi4_8U(src0, src1, tmp, beta, + inSz.width*chanNum, half_nlanes); // horizontal pass - v_uint8 val_0, val_1, val_2, val_3; GAPI_DbgAssert(outSz.width >= half_nlanes); + //This variables are here to initialize them once. This variant don't affect performance. 
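+            // The pairs gathered by v_gather_channel() below are blended in
+            // main_computation_horizontalPass_lpi4() as r = v1 + v_mulhrs(v0 - v1, alpha),
+            // with alpha taken from clone[] (signed Q1.1.14); the two packed results
+            // res1/res2 are then split across the four output rows of each channel.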
+ v_uint8 val_0, val_1, val_2, val_3, res1, res2; + + v_uint8 shuf_mask1 = setHorizontalShufMask1(); + v_uint8 shuf_mask2 = setHorizontalShufMask2(); + for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { v_int16 a10 = vx_load(&clone[4 * x]); @@ -229,45 +480,15 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); - v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); - v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); - v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); - v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); - - v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); - v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); - v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); - v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); - - v_uint8 q2 = v_shuffle_s8(q0, shuf_mask2); - v_uint8 q3 = v_shuffle_s8(q1, shuf_mask2); - - v_uint8 q4 = v_blend_shiftleft<0xCC /*0b11001100*/, 4>(q2, q3); - v_uint8 q5 = v_blend_shiftright<0xCC /*0b11001100*/, 4>(q2, q3); - - v_uint8 q6 = v256_permute4x64<0xD8>(q4); - v_uint8 q7 = v256_permute4x64<0xD8>(q5); - - v_uint8 q8 = v_shuffle_s8(q6, shuf_mask3); - v_uint8 q9 = v_shuffle_s8(q7, shuf_mask3); - - v_store_low(&dst[c][0][x], q8); - v_store_high(&dst[c][1][x], q8); - v_store_low(&dst[c][2][x], q9); - v_store_high(&dst[c][3][x], q9); + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + res1, res2); + + v_store_low(&dst[c][0][x], res1); + v_store_high(&dst[c][1][x], res1); + v_store_low(&dst[c][2][x], res2); + v_store_high(&dst[c][3][x], res2); } } @@ -281,22 +502,11 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, // vertical pass GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { - v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); - v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); - v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; - v_pack_u_store(tmp + w, t); - } - - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; - } - } + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, + inSz.width*chanNum, inSz.width*chanNum); // horizontal pass GAPI_DbgAssert(outSz.width >= half_nlanes); - for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { for (int c = 0; c < chanNum; ++c) { diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp index b2651bd..e2ddd76 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp @@ 
-42,44 +42,44 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[], //----------------------------------------------------------------------------- -// Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi); +// Resize (bi-linear, 8UC1) +void calcRowLinear_8UC1(uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, - const uint8_t *src0[], - const uint8_t *src1[], + const uint8_t* src0[], + const uint8_t* src1[], const short alpha[], const short clone[], const short mapsx[], const short beta[], - uint8_t tmp[], - const Size &inSz, - const Size &outSz, - int lpi); + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC4) void calcRowLinear_8U(C4, std::array, 4> &dst, - const uint8_t *src0[], - const uint8_t *src1[], + const uint8_t* src0[], + const uint8_t* src1[], const short alpha[], const short clone[], const short mapsx[], const short beta[], - uint8_t tmp[], - const Size &inSz, - const Size &outSz, - int lpi); + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); template void calcRowLinear_8UC(std::array, numChan> &dst, diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp index 8b994d8..8d32861 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp @@ -50,18 +50,18 @@ namespace InferenceEngine { namespace gapi { namespace kernels { -// Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], // 4 clones of alpha - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi) { +// 8UC1 Resize (bi-linear) +void calcRowLinear_8UC1( uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; @@ -650,9 +650,9 @@ void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); for (int w = 0; w < inSz.width*chanNum; ) { for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { - v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w])); - v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w])); - v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1; + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; v_pack_u_store(tmp + w, t); } @@ -666,11 +666,11 @@ void calcRowLinear_8UC_Impl_(std::array, chanNum> &dst, for (int x = 0; x < outSz.width; ) { for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { 
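+                // v_int16 / vx_load are the width-agnostic universal-intrinsic aliases;
+                // on this SSE4.2 path they resolve to the 128-bit v_int16x8 / v_load,
+                // so the loop body below matches the AVX2 kernels in spelling only.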
for (int c = 0; c < chanNum; c++) { - v_int16x8 a0 = v_load(&alpha[x]); // as signed Q1.1.14 - v_int16x8 sx = v_load(&mapsx[x]); // as integer (int16) - v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0); - v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1); - v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1; + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; v_pack_u_store(&dst[c][l][x], d); } } diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp index e0a2664..bfc5755 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp @@ -42,17 +42,17 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[], //---------------------------------------------------------------------- // Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi); +void calcRowLinear_8UC1(uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 2272ba5..4d2f854 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -805,12 +805,32 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, src1[l] = in.InLine(index1); dst[l] = out.OutLine(l); } +#if 1 + #ifdef HAVE_AVX2 + if (with_cpu_x86_avx2()) { + if (std::is_same::value) { + if (inSz.width >= 32 && outSz.width >= 16) { + avx::calcRowLinear_8UC1(reinterpret_cast(dst), + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + return; + } + } + } + #endif +#endif #ifdef HAVE_SSE if (with_cpu_x86_sse42()) { if (std::is_same::value) { if (inSz.width >= 16 && outSz.width >= 8) { - calcRowLinear_8U(reinterpret_cast(dst), + calcRowLinear_8UC1(reinterpret_cast(dst), reinterpret_cast(src0), reinterpret_cast(src1), reinterpret_cast(alpha), diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp index 046f604..c057e3a 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp @@ -36,7 +36,7 @@ inline __m256d _v256_shuffle_odd_64(const __m256d& v) { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); } template -inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +static inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) { return _mm256_permute2x128_si256(a, b, imm); } template @@ -52,7 +52,7 @@ static inline _Tpvec v256_permute2x128(const 
_Tpvec& a, const _Tpvec& b) { return _Tpvec(_v256_permute2x128(a.val, b.val)); } template -inline __m256i _v256_permute4x64(const __m256i& a) +static inline __m256i _v256_permute4x64(const __m256i& a) { return _mm256_permute4x64_epi64(a, imm); } template @@ -1956,9 +1956,14 @@ static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4, char b30, char b31) { return v_uint8x32(_mm256_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, - b8, b9, b10, b11, b12, b13, b14, b15, - b16, b17, b18, b19, b20, b21, b22, b23, - b24, b25, b26, b27, b28, b29, b30, b31)); + b8, b9, b10, b11, b12, b13, b14, b15, + b16, b17, b18, b19, b20, b21, b22, b23, + b24, b25, b26, b27, b28, b29, b30, b31)); +} + +static inline v_uint32x8 v_setr_s32(int b0, int b1, int b2, int b3, int b4, int b5, int b6, int b7) +{ + return v_uint32x8(_mm256_setr_epi32(b0, b1, b2, b3, b4, b5, b6, b7)); } inline void v_pack_store(schar* ptr, const v_int16x16& a) @@ -3001,36 +3006,30 @@ static inline void v_deinterleave(const v_float32x8& low, const v_float32x8& hig odd .val = _mm256_unpackhi_ps(tmp0, tmp1); } -static inline void v_deinterleave(const v_uint8x32& i0, const v_uint8x32& i1, - const v_uint8x32& i2, const v_uint8x32& i3, - v_uint8x32& o0, v_uint8x32& o1, - v_uint8x32& o2, v_uint8x32& o3) +static inline void v_deinterleave(const v_uint8x32& v0, const v_uint8x32& v1, + const v_uint8x32& v2, const v_uint8x32& v3, + v_uint8x32& a, v_uint8x32& b, + v_uint8x32& c, v_uint8x32& d, + v_uint8x32& shuf_mask) { - __m256i u0 = i0.val; // a0 b0 c0 d0 a1 b1 c1 d1 ... - __m256i u1 = i1.val; // a4 b4 c4 d4 ... - __m256i u2 = i2.val; // a8 b8 c8 d8 ... - __m256i u3 = i3.val; // a12 b12 c12 d12 ... + /* a0a1a2a3 b0b1b2b3 c0c1c2c3 d0d1d2d3 a16a17a18a19 b16b17b18b19 c16c17c18c19 d16d17d18d19 */ + __m256i u0 = _mm256_shuffle_epi8(v0.val, shuf_mask.val); + /* a4a5a6a7 b4b5b6b7 c4c5c6c7 d4d5d6d7 a20a21a22a23 b20b21b22b23 c20c21c22c23 d20d21d22d23 */ + __m256i u1 = _mm256_shuffle_epi8(v1.val, shuf_mask.val); + /* a8a9a10a11 b8b9b10b11 c8c9c10c11 d8d9d10d11 */ + __m256i u2 = _mm256_shuffle_epi8(v2.val, shuf_mask.val); + __m256i u3 = _mm256_shuffle_epi8(v3.val, shuf_mask.val); - __m256i v0 = _mm256_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... - __m256i v1 = _mm256_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... - __m256i v2 = _mm256_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m256i v3 = _mm256_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ... + __m256i s0 = _mm256_blend_epi16(u0, _mm256_slli_si256(u1, 4), 0xCC /*0b11001100*/); // a0a1a2a3a4a5a6a7 c0c1c2c3c4c5c6c7 a16a17a18a19a29a21a22a23 ... + __m256i s1 = _mm256_blend_epi16(_mm256_srli_si256(u0, 4), u1, 0xCC /*0b11001100*/); + __m256i s2 = _mm256_blend_epi16(u2, _mm256_slli_si256(u3, 4), 0xCC /*0b11001100*/); + __m256i s3 = _mm256_blend_epi16(_mm256_srli_si256(u2, 4), u3, 0xCC /*0b11001100*/); - u0 = _mm256_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... - u1 = _mm256_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... - u2 = _mm256_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... - u3 = _mm256_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... - - v0 = _mm256_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... - v1 = _mm256_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... - v2 = _mm256_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ... - v3 = _mm256_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ... - - o0.val = _mm256_unpacklo_epi8(v0, v1); // a0 a1 a2 a3 ... - o1.val = _mm256_unpackhi_epi8(v0, v1); // b0 b1 b2 b3 ... - o2.val = _mm256_unpacklo_epi8(v2, v3); // c0 c1 c2 c3 ... - o3.val = _mm256_unpackhi_epi8(v2, v3); // d0 d1 d2 d3 ... 
-} + a.val = _mm256_blend_epi16(s0, _mm256_slli_si256(s2, 8), 0xF0 /*0b11110000*/); + c.val = _mm256_blend_epi16(_mm256_srli_si256(s0, 8), s2, 0xF0 /*0b11110000*/); + b.val = _mm256_blend_epi16(s1, _mm256_slli_si256(s3, 8), 0xF0 /*0b11110000*/); + d.val = _mm256_blend_epi16(_mm256_srli_si256(s1, 8), s3, 0xF0 /*0b11110000*/); + } static inline v_uint8x32 v_interleave_low(const v_uint8x32& a, const v_uint8x32& b) { @@ -3090,23 +3089,17 @@ static inline void v_deinterleave_expand(const v_uint8x32& src, v_int16x16& even static inline v_int16x16 v_mulhi(const v_int16x16& a, short b) { - v_int16x16 r; - r.val = _mm256_mulhi_epi16(a.val, _mm256_set1_epi16(b)); - return r; + return v_int16x16(_mm256_mulhi_epi16(a.val, _mm256_set1_epi16(b))); } -static inline v_uint16x16 v_mulhi(const v_uint16x16& a, v_uint16x16 b) +static inline v_uint16x16 v_mulhi(const v_uint16x16& a, v_uint16x16& b) { - v_uint16x16 r; - r.val = _mm256_mulhi_epu16(a.val, b.val); - return r; + return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); } static inline v_uint16x16 v_mulhi(const v_uint16x16& a, uint16_t b) { - v_uint16x16 r; - r.val = _mm256_mulhi_epu16(a.val, _mm256_set1_epi16(b)); - return r; + return v_uint16x16(_mm256_mulhi_epu16(a.val, _mm256_set1_epi16(b))); } static inline v_int16x16 v_mulhrs(const v_int16x16& a, const v_int16x16& b) @@ -3149,6 +3142,17 @@ static inline v_uint8x32 v_shuffle_s8(const v_uint8x32& a, const v_uint8x32& mas return v_uint8x32(_mm256_shuffle_epi8(a.val, mask.val)); } +template +static inline v_uint8x32 v_insert64(v_uint8x32& a, const int64_t& i) +{ + return v_uint8x32(_mm256_insert_epi64(a.val, i, index)); +} + +static inline v_uint8x32 v_permutevar8x32(v_uint8x32& a, v_uint32x8& idxs) +{ + return v_uint8x32(_mm256_permutevar8x32_epi32(a.val, idxs.val)); +} + static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const short mapsx[], int chanNum, int c, int x, int shift) { @@ -3163,6 +3167,29 @@ static inline void v_gather_channel(v_uint8x32& vec, const uint8_t tmp[], const vec.val = _mm256_insert_epi32(vec.val, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + shift + 3] + 1) + c)]), 7); } +// for each j=index[k], load two chars src[j] and src[j+1] +static inline v_uint8x32 v_gather_pairs(const uchar src[], const v_int16x16& index) { + v_uint8x32 r; + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 0)]), 0); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 1)]), 1); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 2)]), 2); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 3)]), 3); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 4)]), 4); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 5)]), 5); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 6)]), 6); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 7)]), 7); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 8)]), 8); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 9)]), 9); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 10)]), 10); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 11)]), 11); + r.val = 
_mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 12)]), 12); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 13)]), 13); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 14)]), 14); + r.val = _mm256_insert_epi16(r.val, *reinterpret_cast(&src[_mm256_extract_epi16(index.val, 15)]), 15); + + return r; +} + namespace { template static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {
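
For readers following the arithmetic: both the vertical and the horizontal passes reduce to the same fixed-point blend, r = v1 + mulhrs(v0 - v1, coeff), where mulhrs has the PMULHRSW rounding semantics ((a * b + 2^14) >> 15) and the coefficients (beta[], alpha[], clone[]) are pre-scaled signed Q1.1.14 weights. The sketch below is a scalar model of that blend for a single sample; it is illustrative only, not part of the patch, and the helper names are made up.

#include <algorithm>
#include <cstdint>

// Scalar model of v_mulhrs / _mm256_mulhrs_epi16: round((a * b) / 2^15).
static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b + 0x4000) >> 15);
}

// One output sample of the bilinear blend done per SIMD lane above:
// v0, v1 are the two neighbouring source pixels (already widened to 16 bit),
// coeff is the Q1.1.14 weight (beta[] in the vertical pass, alpha[]/clone[] in the horizontal pass).
static inline uint8_t blend_u8_scalar(uint8_t v0, uint8_t v1, int16_t coeff) {
    const int16_t diff = static_cast<int16_t>(v0 - v1);              // v_sub_wrap
    const int16_t t    = mulhrs_scalar(diff, coeff);                 // v_mulhrs
    const int     r    = v1 + t;                                     // v_add_wrap
    return static_cast<uint8_t>(std::min(255, std::max(0, r)));      // v_packus / v_pack_u_store
}

In the lpi == 4 path the vertical pass stores its result row-interleaved by column, i.e. tmp[4 * w + l] holds output row l at input column w (the v_store_interleave() branch makes the same layout explicit). That is what lets horizontalPass_lpi4_8UC1() fetch the pixel pair (mapsx[x], mapsx[x] + 1) for all four rows with a single 8-byte load in insert64(). The dispatcher in ie_preprocess_gapi_kernels.cpp takes the AVX2 route only when inSz.width >= 32 and outSz.width >= 16, which satisfies the GAPI_DbgAssert guards on nlanes and half_nlanes in every branch.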