From 7987e268f4be28ff2f14d4c7afded9b2eba302d1 Mon Sep 17 00:00:00 2001
From: yang <yang.zhang@arm.com>
Date: Tue, 26 Feb 2013 11:18:07 +0800
Subject: [PATCH] add image resize functions(NEON version)

---
 CMakeLists.txt                      |   1 +
 common/NE10_mask_table.c            |  19 +-
 common/NE10_mask_table.h            |   5 +
 inc/NE10.h                          |  11 +
 inc/NE10_dsp.h                      |   4 +-
 inc/NE10_imgproc.h                  |  84 ++++++++
 modules/CMakeLists.txt              |  36 +++-
 modules/NE10_init.c                 |   9 +
 modules/imgproc/NE10_init_imgproc.c |  62 ++++++
 modules/imgproc/NE10_resize.neon.s  | 397 ++++++++++++++++++++++++++++++++++++
 10 files changed, 620 insertions(+), 8 deletions(-)
 create mode 100644 inc/NE10_imgproc.h
 create mode 100644 modules/imgproc/NE10_init_imgproc.c
 create mode 100644 modules/imgproc/NE10_resize.neon.s

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6d8adb..f289251 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,7 @@ endif()
 #select functionalities to be compiled
 option(NE10_ENABLE_MATH "Build math functionalities to NE10" ON)
 option(NE10_ENABLE_DSP "Build dsp functionalities to NE10" ON)
+option(NE10_ENABLE_IMGPROC "Build image processing functionalities to NE10" ON)
 
 set(NE10_VERSION 10)
 
diff --git a/common/NE10_mask_table.c b/common/NE10_mask_table.c
index 1e82ea5..0db75a8 100644
--- a/common/NE10_mask_table.c
+++ b/common/NE10_mask_table.c
@@ -33,11 +33,11 @@
 
 const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE] =
 {
-        0x00000000, 0x00000000, 0x00000000, 0x00000000,
-        0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
-        0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000,
-        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
-        0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
+    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE] =
 {
@@ -67,3 +67,12 @@ const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]=
     291,290,289,287,286,285,284,282,281,280,279,278,277,275,274,273,
     272,271,270,269,267,266,265,264,263,262,261,260,259,258,257
     };
+
+const ne10_uint64_t ne10_vresize_mask_residual_table[VRESIZE_MASK_TABLE_SIZE] =
+{   /* entry i has its low (i + 1) bytes set; ne10_vresize_neon loads one entry as a byte-select mask */
+    0x00000000000000FF, 0x000000000000FFFF, /* 1 and 2 bytes */
+    0x0000000000FFFFFF, 0x00000000FFFFFFFF, /* 3 and 4 bytes */
+    0x000000FFFFFFFFFF, 0x0000FFFFFFFFFFFF, /* 5 and 6 bytes */
+    0x00FFFFFFFFFFFFFF                      /* 7 bytes */
+};
+
diff --git a/common/NE10_mask_table.h b/common/NE10_mask_table.h
index afcabba..e051ea1 100644
--- a/common/NE10_mask_table.h
+++ b/common/NE10_mask_table.h
@@ -37,8 +37,13 @@
 #define D_MASK_TABLE_SIZE        6
 #define DIV_LOOKUP_TABLE_SIZE    255
 
+/* mask table for dsp module */
 extern const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE];
 extern const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE];
 extern const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE];
+
+/* mask table for imgproc module: entry i has its low (i + 1) bytes set */
+#define VRESIZE_MASK_TABLE_SIZE    7
+extern const ne10_uint64_t ne10_vresize_mask_residual_table[VRESIZE_MASK_TABLE_SIZE];
 #endif
 
diff --git a/inc/NE10.h b/inc/NE10.h
index c21fd96..148bc86 100644
--- a/inc/NE10.h
+++ b/inc/NE10.h
@@ -82,6 +82,7 @@
    *
    * - @link groupMaths Math Functions@endlink
    * - @link groupDSPs Signal Processing Functions@endlink
+   * - @link groupIMGPROCs Image Processing Functions@endlink
    * - Physics functions
    * - Image Processing functions
    * - Others
@@ -123,6 +124,15 @@
  * such as complex/real FFT/IFFT, FIR and IIR. Currently, only the float (single precision)
  * data type is supported.
  */
+
+/**
+ * @defgroup groupIMGPROCs Image Processing Functions
+ *
+ *
+ * This set of functions provides commonly used image processing operations,
+ * such as image scaling and image rotation.
+ */
+
 /**
  * @defgroup groupSamples Sample Functions
  *
@@ -142,6 +152,7 @@ extern "C" {
 #include "NE10_init.h"
 #include "NE10_math.h"
 #include "NE10_dsp.h"
+#include "NE10_imgproc.h"
 
 #ifdef __cplusplus
 }
diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h
index d25a9cd..ad3109b 100644
--- a/inc/NE10_dsp.h
+++ b/inc/NE10_dsp.h
@@ -217,14 +217,14 @@ extern "C" {
     /** @} */ //end of FIR group
 
     /**
-     * @addtogroup FIR_decimate
+     * @addtogroup FIR_Decimate
      * @{
      */
     extern void ne10_fir_decimate_float_neon (const ne10_fir_decimate_instance_f32_t * S,
             ne10_float32_t *pSrc,
             ne10_float32_t *pDst,
             ne10_uint32_t blockSize);
-    /** @} */ //end of FIR_decimate group
+    /** @} */ //end of FIR_Decimate group
 
     /**
      * @addtogroup FIR_Interpolate
diff --git a/inc/NE10_imgproc.h b/inc/NE10_imgproc.h
new file mode 100644
index 0000000..5a79b16
--- /dev/null
+++ b/inc/NE10_imgproc.h
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2013 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : inc/NE10_imgproc.h
+ */
+
+
+#include <NE10_types.h>
+
+#ifndef NE10_IMGPROC_H
+#define NE10_IMGPROC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+    /* image resize functions */
+
+    /* function pointers; ne10_init_imgproc() binds them to the NEON versions when NEON is available */
+    extern void (*ne10_vresize) (const ne10_int32_t** src,
+                                 ne10_uint8_t* dst,
+                                 const ne10_int16_t* beta,
+                                 ne10_int32_t width);
+    extern void (*ne10_hresize_4channels) (const ne10_uint8_t** src,
+                                           ne10_int32_t** dst,
+                                           ne10_int32_t count,
+                                           const ne10_int32_t* xofs,
+                                           const ne10_int16_t* alpha,
+                                           ne10_int32_t swidth,
+                                           ne10_int32_t dwidth,
+                                           ne10_int32_t cn,
+                                           ne10_int32_t xmin,
+                                           ne10_int32_t xmax);
+
+    /* NEON versions (modules/imgproc/NE10_resize.neon.s) */
+    extern void ne10_vresize_neon (const ne10_int32_t** src, /* src[0], src[1]: the two input rows */
+                                   ne10_uint8_t* dst,        /* output bytes, clamped to [0, 255] */
+                                   const ne10_int16_t* beta, /* two 16-bit row weights: beta[0], beta[1] */
+                                   ne10_int32_t width);      /* number of output bytes */
+    extern void ne10_hresize_4channels_neon (const ne10_uint8_t** src, /* src[0], src[1]: the two source rows */
+            ne10_int32_t** dst,         /* dst[0], dst[1]: the two 32-bit output rows */
+            ne10_int32_t count,         /* not read by the NEON code, which always handles two rows */
+            const ne10_int32_t* xofs,   /* byte offset into the source row; one entry is read per 4-channel pixel */
+            const ne10_int16_t* alpha,  /* 16-bit weight pairs, repeated for each channel */
+            ne10_int32_t swidth,        /* not read by the NEON code */
+            ne10_int32_t dwidth,        /* output row length in elements (4 per pixel) */
+            ne10_int32_t cn,            /* not read by the NEON code; 4 channels are assumed */
+            ne10_int32_t xmin,          /* not read by the NEON code */
+            ne10_int32_t xmax);         /* elements below xmax are interpolated, the rest are source * 2048 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 8dea024..6a4969e 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -171,13 +171,47 @@ if(NE10_ENABLE_DSP)
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_init_dsp.c
     )
 
-    # Add math files
+    # Add dsp files
     set(NE10_INIT_SRCS ${NE10_INIT_SRCS} ${NE10_DSP_INIT_SRCS})
     set(NE10_C_SRCS ${NE10_C_SRCS} ${NE10_DSP_C_SRCS})
     set(NE10_INTRINSIC_SRCS ${NE10_INTRINSIC_SRCS} ${NE10_DSP_INTRINSIC_SRCS})
     set(NE10_NEON_SRCS ${NE10_NEON_SRCS} ${NE10_DSP_NEON_SRCS})
 endif()
 
+if(NE10_ENABLE_IMGPROC)
+    # Defines NE10_ENABLE_IMGPROC so that ne10_init() calls ne10_init_imgproc().
+    add_definitions(-DNE10_ENABLE_IMGPROC)
+    # C sources of the image processing module.
+    set(NE10_IMGPROC_C_SRCS
+        ${PROJECT_SOURCE_DIR}/common/NE10_mask_table.c
+    )
+
+    # NEON assembly sources of the image processing module.
+    set(NE10_IMGPROC_NEON_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/imgproc/NE10_resize.neon.s
+    )
+
+    # Build the assembly files with the C compiler driver, NEON enabled.
+    set(FLAGS "-mfpu=neon -Wa,-I${PROJECT_SOURCE_DIR}/inc -Wa,-I${PROJECT_SOURCE_DIR}/common")
+    foreach(neon_file ${NE10_IMGPROC_NEON_SRCS})
+        set_source_files_properties(
+            ${neon_file} PROPERTIES
+            LANGUAGE C
+            COMPILE_FLAGS ${FLAGS}
+        )
+    endforeach()
+
+    # Init source of the image processing module.
+    set(NE10_IMGPROC_INIT_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/imgproc/NE10_init_imgproc.c
+    )
+
+    # Append the module's files to the library source lists.
+    list(APPEND NE10_INIT_SRCS ${NE10_IMGPROC_INIT_SRCS})
+    list(APPEND NE10_C_SRCS ${NE10_IMGPROC_C_SRCS})
+    list(APPEND NE10_NEON_SRCS ${NE10_IMGPROC_NEON_SRCS})
+endif()
+
 include_directories (
     ${PROJECT_SOURCE_DIR}/inc
     ${PROJECT_SOURCE_DIR}/common
diff --git a/modules/NE10_init.c b/modules/NE10_init.c
index b00482c..a4ad821 100644
--- a/modules/NE10_init.c
+++ b/modules/NE10_init.c
@@ -84,5 +84,14 @@ ne10_result_t ne10_init()
     }
 #endif
 
+#if defined (NE10_ENABLE_IMGPROC)
+    status = ne10_init_imgproc (is_NEON_available);
+    if (status != NE10_OK)
+    {
+        fprintf(stderr, "ERROR: init imgproc failed\n");
+        return NE10_ERR;
+    }
+#endif
+
     return NE10_OK;
 }
diff --git a/modules/imgproc/NE10_init_imgproc.c b/modules/imgproc/NE10_init_imgproc.c
new file mode 100644
index 0000000..afe7002
--- /dev/null
+++ b/modules/imgproc/NE10_init_imgproc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2013 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+
+#include "NE10_imgproc.h"
+
+ne10_result_t ne10_init_imgproc (ne10_int32_t is_NEON_available)
+{
+    /* Only NEON implementations exist: without NEON the function
+     * pointers are left as they were (NULL unless bound earlier). */
+    if (is_NEON_available != NE10_OK)
+    {
+        return NE10_OK;
+    }
+
+    ne10_vresize = ne10_vresize_neon;
+    ne10_hresize_4channels = ne10_hresize_4channels_neon;
+    return NE10_OK;
+}
+
+// Definitions of the function pointers declared in inc/NE10_imgproc.h; zero-initialized (NULL) until ne10_init_imgproc() binds them.
+void (*ne10_vresize) (const ne10_int32_t** src,
+                      ne10_uint8_t* dst,
+                      const ne10_int16_t* beta,
+                      ne10_int32_t width);
+void (*ne10_hresize_4channels) (const ne10_uint8_t** src,
+                                ne10_int32_t** dst,
+                                ne10_int32_t count,
+                                const ne10_int32_t* xofs,
+                                const ne10_int16_t* alpha,
+                                ne10_int32_t swidth,
+                                ne10_int32_t dwidth,
+                                ne10_int32_t cn,
+                                ne10_int32_t xmin,
+                                ne10_int32_t xmax);
+
+
diff --git a/modules/imgproc/NE10_resize.neon.s b/modules/imgproc/NE10_resize.neon.s
new file mode 100644
index 0000000..e72ae76
--- /dev/null
+++ b/modules/imgproc/NE10_resize.neon.s
@@ -0,0 +1,397 @@
+/*
+ *  Copyright 2013 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : imgproc/NE10_resize.neon.s
+ */
+
+        .text
+        .syntax   unified
+
+
+        /**
+         * @details
+         * This function implements the vertical interpolation
+         *
+         * @param[in]   **src            points to input pointers
+         * @param[out]  *dst             points to the output buffer
+         * @param[in]   *beta            points to interpolate parameter
+         * @param[in]   width            width of output buffer
+         */
+
+        .align   4
+        .global   ne10_vresize_neon
+        .thumb
+        .extern ne10_vresize_mask_residual_table/* mask of store data */
+        .thumb_func
+        .equ         BITS,        0x16        /* INTER_RESIZE_COEF_BITS*2 */
+        .equ         DELTA,       0x200000    /* 1 << (INTER_RESIZE_COEF_BITS*2 - 1) */
+
+ne10_vresize_neon:
+                     push    {r4-r6,lr}             /* r4-r6 are used as scratch below */
+
+/*ARM Registers*/
+pSrc             .req   r0
+pDst             .req   r1
+pBeta            .req   r2
+width            .req   r3
+
+pS0              .req   r4
+pS1              .req   r5
+tmp              .req   r6
+
+beta0            .req   r2
+beta1            .req   r6
+
+pMask            .req   r6
+
+/*NEON variable Declaration*/
+dBeta0           .dn   d0[0]
+dBeta1           .dn   d0[1]
+dMask            .dn   d1
+qDelta           .qn   q1
+qMin             .qn   q12
+qMax             .qn   q13
+
+qS0_0123         .qn   q2
+qS0_4567         .qn   q3
+qS1_0123         .qn   q8
+qS1_4567         .qn   q9
+
+qTmp_0123        .qn   q10
+qTmp_4567        .qn   q11
+dTmp_0123        .dn   d20
+dTmp_4567        .dn   d21
+qTmp_01234567    .qn   q10
+dTmp_01234567    .dn   d20
+dDst_01234567    .dn   d28      /* kept out of q10/q11 (d20-d23), which are rewritten before the blend */
+
+                     ldr         beta0, [pBeta]             /* both 16-bit weights in one 32-bit load */
+                     ldr         pS0, [pSrc], #4
+                     ldr         pS1, [pSrc]
+                     lsr         beta1, beta0, #16          /* beta1 = upper halfword */
+                     lsl         beta0, beta0, #16          /* beta0 = lower halfword, */
+                     lsr         beta0, beta0, #16          /* zero-extended */
+                     vmov.s32    dBeta0, beta0
+                     vmov.s32    dBeta1, beta1
+
+                     mov         tmp, DELTA
+                     vdup.32     qDelta, tmp
+                     veor        qMin, qMin, qMin           /* lower clamp bound: 0 */
+                     mov         tmp, #255
+                     vdup.32     qMax, tmp                  /* upper clamp bound: 255 */
+
+                     subs        width, width, #8
+                     vld1.s32    {qS0_0123, qS0_4567}, [pS0]!   /* loaded before the branch so the tail has data; flags are untouched */
+                     vld1.s32    {qS1_0123, qS1_4567}, [pS1]!
+                     blt         VResizeResidual            /* fewer than 8 outputs: masked tail only */
+
+VResizeMainLoop:
+
+                     vmul.s32    qTmp_0123, qS0_0123, dBeta0
+                     vmul.s32    qTmp_4567, qS0_4567, dBeta0
+                     vmla.s32    qTmp_0123, qS1_0123, dBeta1
+                     vmla.s32    qTmp_4567, qS1_4567, dBeta1
+
+                     vadd.s32    qTmp_0123, qTmp_0123, qDelta   /* add half an output step for rounding */
+                     vadd.s32    qTmp_4567, qTmp_4567, qDelta
+
+                     vshr.s32    qTmp_0123, qTmp_0123, #BITS    /* drop the fractional bits */
+                     vshr.s32    qTmp_4567, qTmp_4567, #BITS
+
+                     vmax.s32    qTmp_0123, qTmp_0123, qMin     /* clamp to [0, 255] */
+                     vmax.s32    qTmp_4567, qTmp_4567, qMin
+                     vmin.s32    qTmp_0123, qTmp_0123, qMax
+                     vmin.s32    qTmp_4567, qTmp_4567, qMax
+
+                     vmovn.I32    dTmp_0123, qTmp_0123          /* 32 -> 16 -> 8 bits */
+                     vmovn.I32    dTmp_4567, qTmp_4567
+                     vmovn.I16    dTmp_01234567, qTmp_01234567
+                     vst1.8       {dTmp_01234567}, [pDst]!
+
+
+                     subs        width, width, #8
+                     vld1.s32    {qS0_0123, qS0_4567}, [pS0]!   /* preload the next 8 values of each row */
+                     vld1.s32    {qS1_0123, qS1_4567}, [pS1]!
+                     bge         VResizeMainLoop
+VResizeResidual:
+                     adds        width, width, #8           /* width = number of outputs left (0..7) */
+                     ble         VResizeEnd                 /* nothing left (or non-positive width) */
+
+                     /* table entry (width - 1) has the low "width" bytes set */
+                     ldr         pMask, =ne10_vresize_mask_residual_table
+                     sub         width, width, #1
+                     add         pMask, pMask, width, lsl #3
+                     vld1.64     {dMask}, [pMask]
+                     vld1.64     {dDst_01234567}, [pDst]    /* current destination bytes, kept where the mask is 0 */
+
+/* masked tail: compute a full 8-byte vector, then blend it with the existing destination */
+
+                     vmul.s32    qTmp_0123, qS0_0123, dBeta0
+                     vmul.s32    qTmp_4567, qS0_4567, dBeta0
+                     vmla.s32    qTmp_0123, qS1_0123, dBeta1
+                     vmla.s32    qTmp_4567, qS1_4567, dBeta1
+
+                     vadd.s32    qTmp_0123, qTmp_0123, qDelta
+                     vadd.s32    qTmp_4567, qTmp_4567, qDelta
+
+                     vshr.s32    qTmp_0123, qTmp_0123, #BITS
+                     vshr.s32    qTmp_4567, qTmp_4567, #BITS
+
+                     vmax.s32    qTmp_0123, qTmp_0123, qMin
+                     vmax.s32    qTmp_4567, qTmp_4567, qMin
+                     vmin.s32    qTmp_0123, qTmp_0123, qMax
+                     vmin.s32    qTmp_4567, qTmp_4567, qMax
+
+                     vmovn.I32    dTmp_0123, qTmp_0123
+                     vmovn.I32    dTmp_4567, qTmp_4567
+                     vmovn.I16    dTmp_01234567, qTmp_01234567
+                     vbsl         dMask, dTmp_01234567, dDst_01234567   /* result is written to dMask */
+                     vst1.8       {dMask}, [pDst]           /* 8 bytes written; those past width keep their old value */
+VResizeEnd:
+                     /*Return From Function*/
+                     pop     {r4-r6,pc}
+
+
+/*ARM Registers*/
+.unreq               pSrc
+.unreq               pDst
+.unreq               pBeta
+.unreq               width
+
+.unreq               pS0
+.unreq               pS1
+.unreq               tmp
+
+.unreq               beta0
+.unreq               beta1
+
+.unreq               pMask
+
+ /*NEON variable Declaration*/
+.unreq               dBeta0
+.unreq               dBeta1
+.unreq               qDelta
+.unreq               dMask
+
+.unreq               qMin
+.unreq               qMax
+
+.unreq               qS0_0123
+.unreq               qS0_4567
+.unreq               qS1_0123
+.unreq               qS1_4567
+.unreq               qTmp_0123
+.unreq               qTmp_4567
+.unreq               dTmp_0123
+.unreq               dTmp_4567
+
+.unreq               qTmp_01234567
+.unreq               dTmp_01234567
+.unreq               dDst_01234567
+
+        /**
+         * @details
+         * This function implements the horizontal interpolation
+         *
+         * @param[in]   **src            points to input pointers
+         * @param[out]  **dst            points to the output pointers
+         * @param[in]   count
+         * @param[in]   *xofs            points to interpolate offset
+         * @param[in]   *alpha           points to interpolate parameter
+         * @param[in]   swidth           width of input buffer
+         * @param[in]   dwidth           width of output buffer
+         * @param[in]   cn
+         * @param[in]   xmin
+         * @param[in]   xmax
+         */
+
+        .align   4
+        .global   ne10_hresize_4channels_neon
+        .thumb
+        .thumb_func
+        .equ         INTER_RESIZE_COEF_SCALE,        0x800  /* 1 << INTER_RESIZE_COEF_BITS */
+
+ne10_hresize_4channels_neon:
+                     push    {r4-r10,lr}            /* 8 registers pushed: stack arguments start at [sp, #32] */
+
+/*ARM Registers*/
+pIn0             .req   r0
+pIn1             .req   r1
+pIn2             .req   r2
+pIn3             .req   r3
+
+pSrc             .req   r0
+pDst             .req   r1
+pXofs            .req   r2
+pAlpha           .req   r3
+
+pS0              .req   r4
+pS1              .req   r0
+pD0              .req   r5
+pD1              .req   r1
+
+dwidth           .req   r6
+xmax             .req   r7
+
+sx               .req   r8
+tmp              .req   r12
+pTmp0            .req   r9
+pTmp1            .req   r10
+
+
+/*NEON variable Declaration*/
+dAlpha_0         .dn   d0
+dAlpha_1         .dn   d1
+dCoeff           .dn   d2
+
+dS0_01234567     .dn   d4
+dS1_01234567     .dn   d5
+qS0_01234567     .qn   q11
+dS0_0123         .dn   d22
+dS0_4567         .dn   d23
+qS1_01234567     .qn   q8
+dS1_0123         .dn   d16
+dS1_4567         .dn   d17
+
+qDst0_0123       .qn   q9
+qDst1_0123       .qn   q10
+
+                     ldr         pS0, [pSrc], #4         /* src[0] and src[1]: the two source rows */
+                     ldr         pS1, [pSrc]
+                     ldr         pD0, [pDst], #4         /* dst[0] and dst[1]: the two output rows */
+                     ldr         pD1, [pDst]
+
+                     mov         tmp, INTER_RESIZE_COEF_SCALE
+                     vdup.16     dCoeff, tmp
+
+                     mov         pXofs, pIn3             /* r2 (count) is overwritten without being read */
+                     ldr         pAlpha, [sp, #32]       /* 5th argument: alpha */
+                     ldr         dwidth, [sp, #40]       /* 7th argument: dwidth */
+                     ldr         xmax, [sp, #52]         /* 10th argument: xmax */
+                     sub         dwidth, dwidth, xmax    /* calculate the residual */
+
+                     ldr         sx, [pXofs], #16     /* for 4 channels only, xofs is changed based on channels */
+                     add         pTmp0, pS0, sx     /* find the address of starting element */
+                     add         pTmp1, pS1, sx
+                     vld2.16     {dAlpha_0, dAlpha_1}, [pAlpha]! /* alpha is repeated based on channels */
+                     vld1.8      {dS0_01234567}, [pTmp0]
+                     vld1.8      {dS1_01234567}, [pTmp1]
+
+                     subs        xmax, xmax, #4          /* first pixel is loaded above, so the residual path has valid data */
+                     blt         HResize4Residual        /* nothing to interpolate: residual pixels only */
+
+HResize4MainLoop:
+
+                     vmovl.u8    qS0_01234567, dS0_01234567     /* widen: low half = pixel at sx, high half = next pixel */
+                     vmovl.u8    qS1_01234567, dS1_01234567
+
+                     vmull.u16   qDst0_0123, dS0_0123, dAlpha_0
+                     vmull.u16   qDst1_0123, dS1_0123, dAlpha_0
+                     vmlal.u16   qDst0_0123, dS0_4567, dAlpha_1
+                     vmlal.u16   qDst1_0123, dS1_4567, dAlpha_1
+
+                     vst1.32     {qDst0_0123}, [pD0]!
+                     vst1.32     {qDst1_0123}, [pD1]!
+/* preload the next pixel; on the last iteration this reads xofs/alpha entries beyond those consumed so far */
+                     ldr         sx, [pXofs], #16     /* for 4 channels only, xofs is changed based on channels */
+                     add         pTmp0, pS0, sx     /* find the address of starting element */
+                     add         pTmp1, pS1, sx
+                     vld2.16     {dAlpha_0, dAlpha_1}, [pAlpha]! /* alpha is repeated based on channels */
+                     vld1.8      {dS0_01234567}, [pTmp0]
+                     vld1.8      {dS1_01234567}, [pTmp1]
+
+                     subs        xmax, xmax, #4
+                     bge         HResize4MainLoop
+HResize4Residual:
+                     cbz         dwidth, HResize4End     /* no residual elements: done */
+
+HResize4ResidualLoop:
+
+                     vmovl.u8    qS0_01234567, dS0_01234567
+                     vmovl.u8    qS1_01234567, dS1_01234567
+
+                     vmull.u16   qDst0_0123, dS0_0123, dCoeff   /* single source pixel scaled by INTER_RESIZE_COEF_SCALE */
+                     vmull.u16   qDst1_0123, dS1_0123, dCoeff
+
+                     vst1.32     {qDst0_0123}, [pD0]!
+                     vst1.32     {qDst1_0123}, [pD1]!
+
+                     ldr         sx, [pXofs], #16     /* for 4 channels only, xofs is changed based on channels */
+                     add         pTmp0, pS0, sx     /* find the address of starting element */
+                     add         pTmp1, pS1, sx
+                     vld1.8      {dS0_01234567}, [pTmp0]
+                     vld1.8      {dS1_01234567}, [pTmp1]
+
+                     subs        dwidth, dwidth, #4
+                     bgt         HResize4ResidualLoop
+
+HResize4End:
+                     /*Return From Function*/
+                     pop     {r4-r10,pc}
+
+
+ /*ARM Registers*/
+.unreq               pIn0
+.unreq               pIn1
+.unreq               pIn2
+.unreq               pIn3
+
+.unreq               pSrc
+.unreq               pDst
+.unreq               pXofs
+.unreq               pAlpha
+
+.unreq               pS0
+.unreq               pS1
+.unreq               pD0
+.unreq               pD1
+
+.unreq               dwidth
+.unreq               xmax
+.unreq               sx
+.unreq               tmp
+
+/*NEON variable Declaration*/
+.unreq               dAlpha_0
+.unreq               dAlpha_1
+.unreq               dCoeff
+
+.unreq               dS0_01234567
+.unreq               dS1_01234567
+.unreq               qS0_01234567
+.unreq               dS0_0123
+.unreq               dS0_4567
+.unreq               qS1_01234567
+.unreq               dS1_0123
+.unreq               dS1_4567
+
+.unreq               qDst0_0123
+.unreq               qDst1_0123
+
+
+       .end
-- 
2.7.4