From 7987e268f4be28ff2f14d4c7afded9b2eba302d1 Mon Sep 17 00:00:00 2001 From: yang Date: Tue, 26 Feb 2013 11:18:07 +0800 Subject: [PATCH] add image resize functions(NEON version) --- CMakeLists.txt | 1 + common/NE10_mask_table.c | 19 +- common/NE10_mask_table.h | 5 + inc/NE10.h | 11 + inc/NE10_dsp.h | 4 +- inc/NE10_imgproc.h | 84 ++++++++ modules/CMakeLists.txt | 36 +++- modules/NE10_init.c | 9 + modules/imgproc/NE10_init_imgproc.c | 62 ++++++ modules/imgproc/NE10_resize.neon.s | 397 ++++++++++++++++++++++++++++++++++++ 10 files changed, 620 insertions(+), 8 deletions(-) create mode 100644 inc/NE10_imgproc.h create mode 100644 modules/imgproc/NE10_init_imgproc.c create mode 100644 modules/imgproc/NE10_resize.neon.s diff --git a/CMakeLists.txt b/CMakeLists.txt index e6d8adb..f289251 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ endif() #select functionalities to be compiled option(NE10_ENABLE_MATH "Build math functionalities to NE10" ON) option(NE10_ENABLE_DSP "Build dsp functionalities to NE10" ON) +option(NE10_ENABLE_IMGPROC "Build image processing functionalities to NE10" ON) set(NE10_VERSION 10) diff --git a/common/NE10_mask_table.c b/common/NE10_mask_table.c index 1e82ea5..0db75a8 100644 --- a/common/NE10_mask_table.c +++ b/common/NE10_mask_table.c @@ -33,11 +33,11 @@ const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, - 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE] = { @@ -67,3 +67,12 @@ const ne10_uint32_t 
ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]= 291,290,289,287,286,285,284,282,281,280,279,278,277,275,274,273, 272,271,270,269,267,266,265,264,263,262,261,260,259,258,257 }; + +const ne10_uint64_t ne10_vresize_mask_residual_table[VRESIZE_MASK_TABLE_SIZE] = +{ + 0x00000000000000FF, 0x000000000000FFFF, + 0x0000000000FFFFFF, 0x00000000FFFFFFFF, + 0x000000FFFFFFFFFF, 0x0000FFFFFFFFFFFF, + 0x00FFFFFFFFFFFFFF +}; + diff --git a/common/NE10_mask_table.h b/common/NE10_mask_table.h index afcabba..e051ea1 100644 --- a/common/NE10_mask_table.h +++ b/common/NE10_mask_table.h @@ -37,8 +37,13 @@ #define D_MASK_TABLE_SIZE 6 #define DIV_LOOKUP_TABLE_SIZE 255 +/* mask table for dsp module */ extern const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE]; extern const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE]; extern const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]; + +/* mask table for imgproc module */ +#define VRESIZE_MASK_TABLE_SIZE 7 +extern const ne10_uint64_t ne10_vresize_mask_residual_table[VRESIZE_MASK_TABLE_SIZE]; #endif diff --git a/inc/NE10.h b/inc/NE10.h index c21fd96..148bc86 100644 --- a/inc/NE10.h +++ b/inc/NE10.h @@ -82,6 +82,7 @@ * * - @link groupMaths Math Functions@endlink * - @link groupDSPs Signal Processing Functions@endlink + * - @link groupIMGPROCs Image Processing Functions@endlink * - Physics functions * - Image Processing functions * - Others @@ -123,6 +124,15 @@ * such as complex/real FFT/IFFT, FIR and IIR. Currently, only the float (single precision) * data type is supported. */ + +/** + * @defgroup groupIMGPROCs Image Processing Functions + * + * + * This set of functions provide some commonly used functions in image processing, + * such as image scale, image rotate. 
+ */ + /** * @defgroup groupSamples Sample Functions * @@ -142,6 +152,7 @@ extern "C" { #include "NE10_init.h" #include "NE10_math.h" #include "NE10_dsp.h" +#include "NE10_imgproc.h" #ifdef __cplusplus } diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h index d25a9cd..ad3109b 100644 --- a/inc/NE10_dsp.h +++ b/inc/NE10_dsp.h @@ -217,14 +217,14 @@ extern "C" { /** @} */ //end of FIR group /** - * @addtogroup FIR_decimate + * @addtogroup FIR_Decimate * @{ */ extern void ne10_fir_decimate_float_neon (const ne10_fir_decimate_instance_f32_t * S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize); - /** @} */ //end of FIR_decimate group + /** @} */ //end of FIR_Decimate group /** * @addtogroup FIR_Interpolate diff --git a/inc/NE10_imgproc.h b/inc/NE10_imgproc.h new file mode 100644 index 0000000..5a79b16 --- /dev/null +++ b/inc/NE10_imgproc.h @@ -0,0 +1,84 @@ +/* + * Copyright 2013 ARM Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of ARM Limited nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NE10 Library : inc/NE10_imgproc.h + */ + + +#include + +#ifndef NE10_IMGPROC_H +#define NE10_IMGPROC_H + +#ifdef __cplusplus +extern "C" { +#endif + +/////////////////////////// +// function prototypes: +/////////////////////////// + + /* image resize functions*/ + + /* function pointers*/ + extern void (*ne10_vresize) (const ne10_int32_t** src, + ne10_uint8_t* dst, + const ne10_int16_t* beta, + ne10_int32_t width); + extern void (*ne10_hresize_4channels) (const ne10_uint8_t** src, + ne10_int32_t** dst, + ne10_int32_t count, + const ne10_int32_t* xofs, + const ne10_int16_t* alpha, + ne10_int32_t swidth, + ne10_int32_t dwidth, + ne10_int32_t cn, + ne10_int32_t xmin, + ne10_int32_t xmax); + + /* NEON version*/ + extern void ne10_vresize_neon (const ne10_int32_t** src, + ne10_uint8_t* dst, + const ne10_int16_t* beta, + ne10_int32_t width); + extern void ne10_hresize_4channels_neon (const ne10_uint8_t** src, + ne10_int32_t** dst, + ne10_int32_t count, + const ne10_int32_t* xofs, + const ne10_int16_t* alpha, + ne10_int32_t swidth, + ne10_int32_t dwidth, + ne10_int32_t cn, + ne10_int32_t xmin, + ne10_int32_t xmax); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt index 8dea024..6a4969e 100644 --- a/modules/CMakeLists.txt +++ b/modules/CMakeLists.txt @@ -171,13 +171,47 @@ if(NE10_ENABLE_DSP) ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_init_dsp.c ) - # Add math files + # Add dsp files set(NE10_INIT_SRCS 
${NE10_INIT_SRCS} ${NE10_DSP_INIT_SRCS}) set(NE10_C_SRCS ${NE10_C_SRCS} ${NE10_DSP_C_SRCS}) set(NE10_INTRINSIC_SRCS ${NE10_INTRINSIC_SRCS} ${NE10_DSP_INTRINSIC_SRCS}) set(NE10_NEON_SRCS ${NE10_NEON_SRCS} ${NE10_DSP_NEON_SRCS}) endif() +if(NE10_ENABLE_IMGPROC) + #enable NE10_init_imgproc + add_definitions(-DNE10_ENABLE_IMGPROC) + # Add image processing C files. + set(NE10_IMGPROC_C_SRCS + ${PROJECT_SOURCE_DIR}/common/NE10_mask_table.c + ) + + # Add image processing NEON files. + set(NE10_IMGPROC_NEON_SRCS + ${PROJECT_SOURCE_DIR}/modules/imgproc/NE10_resize.neon.s + ) + + # Tell CMake these files need to go to the C compiler + set(FLAGS "-mfpu=neon -Wa,-I${PROJECT_SOURCE_DIR}/inc -Wa,-I${PROJECT_SOURCE_DIR}/common" ) + foreach(neon_file ${NE10_IMGPROC_NEON_SRCS}) + set_property (SOURCE ${neon_file} PROPERTY LANGUAGE C) + set_source_files_properties( + ${neon_file} PROPERTIES COMPILE_FLAGS + ${FLAGS} + ) + endforeach(neon_file) + + # Add image processing init files. + set(NE10_IMGPROC_INIT_SRCS + ${PROJECT_SOURCE_DIR}/modules/imgproc/NE10_init_imgproc.c + ) + + # Add image processing files + set(NE10_INIT_SRCS ${NE10_INIT_SRCS} ${NE10_IMGPROC_INIT_SRCS}) + set(NE10_C_SRCS ${NE10_C_SRCS} ${NE10_IMGPROC_C_SRCS}) + set(NE10_NEON_SRCS ${NE10_NEON_SRCS} ${NE10_IMGPROC_NEON_SRCS}) +endif() + include_directories ( ${PROJECT_SOURCE_DIR}/inc ${PROJECT_SOURCE_DIR}/common diff --git a/modules/NE10_init.c b/modules/NE10_init.c index b00482c..a4ad821 100644 --- a/modules/NE10_init.c +++ b/modules/NE10_init.c @@ -84,5 +84,14 @@ ne10_result_t ne10_init() } #endif +#if defined (NE10_ENABLE_IMGPROC) + status = ne10_init_imgproc (is_NEON_available); + if (status != NE10_OK) + { + fprintf(stderr, "ERROR: init imgproc failed\n"); + return NE10_ERR; + } +#endif + return NE10_OK; } diff --git a/modules/imgproc/NE10_init_imgproc.c b/modules/imgproc/NE10_init_imgproc.c new file mode 100644 index 0000000..afe7002 --- /dev/null +++ b/modules/imgproc/NE10_init_imgproc.c @@ -0,0 +1,62 @@ +/* + * 
Copyright 2013 ARM Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of ARM Limited nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include  /* NOTE(review): header name is missing here, probably lost in extraction - restore before applying */
+
+#include "NE10_imgproc.h"
+
+/* Route the image resize entry points to their NEON implementations. */
+ne10_result_t ne10_init_imgproc (ne10_int32_t is_NEON_available)
+{
+    /* Without NEON there is nothing to bind here: the pointers are left unset. */
+    if (NE10_OK != is_NEON_available)
+    {
+        return NE10_OK;
+    }
+
+    ne10_hresize_4channels = ne10_hresize_4channels_neon;
+    ne10_vresize = ne10_vresize_neon;
+    return NE10_OK;
+}
+
+// Storage for the function pointers that inc/NE10_imgproc.h declares
+void (*ne10_vresize) (const ne10_int32_t** src,
+                      ne10_uint8_t* dst,
+                      const ne10_int16_t* beta,
+                      ne10_int32_t width);
+void (*ne10_hresize_4channels) (const ne10_uint8_t** src,
+                                ne10_int32_t** dst,
+                                ne10_int32_t count,
+                                const ne10_int32_t* xofs,
+                                const ne10_int16_t* alpha,
+                                ne10_int32_t swidth,
+                                ne10_int32_t dwidth,
+                                ne10_int32_t cn,
+                                ne10_int32_t xmin,
+                                ne10_int32_t xmax);
+
+
diff --git a/modules/imgproc/NE10_resize.neon.s b/modules/imgproc/NE10_resize.neon.s
new file mode 100644
index 0000000..e72ae76
--- /dev/null
+++ b/modules/imgproc/NE10_resize.neon.s
@@ -0,0 +1,397 @@
+/*
+ * Copyright 2013 ARM Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of ARM Limited nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : imgproc/NE10_resize.neon.s
+ */
+
+        .text
+        .syntax   unified
+
+
+        /**
+         * @details
+         * This function implements the vertical interpolation.
+         * For every output byte i it computes
+         *   dst[i] = clamp ((src[0][i] * beta[0] + src[1][i] * beta[1] + DELTA) >> BITS, 0, 255)
+         * Eight outputs are produced per iteration. A final group of 1..7
+         * outputs is merged into dst through ne10_vresize_mask_residual_table,
+         * so the bytes of dst beyond width are written back unchanged.
+         * NOTE: that final group still loads eight elements from each source
+         * row and reads/writes eight bytes of dst, so the buffers must be
+         * accessible up to the next multiple of eight.
+         *
+         * @param[in]  **src  points to input pointers (src[0] and src[1] are used)
+         * @param[out] *dst   points to the output buffer
+         * @param[in]  *beta  points to interpolate parameter (beta[0] and beta[1], signed 16-bit)
+         * @param[in]  width  width of output buffer; nothing is written when width <= 0
+         */
+
+        .align 4
+        .global   ne10_vresize_neon
+        .thumb
+        .extern   ne10_vresize_mask_residual_table /* byte-select masks for the final partial store */
+        .thumb_func
+        .equ      BITS, 0x16         /* INTER_RESIZE_COEF_BITS*2 */
+        .equ      DELTA, 0x200000    /* 1 << (INTER_RESIZE_COEF_BITS*2 - 1) */
+
+ne10_vresize_neon:
+        push              {r4-r6,lr}
+
+/*ARM Registers*/
+pSrc             .req   r0
+pDst             .req   r1
+pBeta            .req   r2
+width            .req   r3
+
+pS0              .req   r4
+pS1              .req   r5
+tmp              .req   r6
+
+beta0            .req   r2
+beta1            .req   r6
+
+pMask            .req   r6
+
+/*NEON variable Declaration*/
+dBeta0           .dn   d0[0]
+dBeta1           .dn   d0[1]
+dMask            .dn   d1
+qDelta           .qn   q1
+qMin             .qn   q12
+qMax             .qn   q13
+
+qS0_0123         .qn   q2
+qS0_4567         .qn   q3
+qS1_0123         .qn   q8
+qS1_4567         .qn   q9
+
+qTmp_0123        .qn   q10
+qTmp_4567        .qn   q11
+dTmp_0123        .dn   d20
+dTmp_4567        .dn   d21
+qTmp_01234567    .qn   q10
+dTmp_01234567    .dn   d20
+dDst_01234567    .dn   d28           /* must not overlap q10/q11: it has to survive the narrowing steps */
+
+        /* beta[] holds signed 16-bit coefficients. beta[1] is read first
+           because beta0 lives in the same register as pBeta. */
+        ldrsh             beta1, [pBeta, #2]
+        ldrsh             beta0, [pBeta]
+        ldr               pS0, [pSrc], #4
+        ldr               pS1, [pSrc]
+        vmov.s32          dBeta0, beta0
+        vmov.s32          dBeta1, beta1
+
+        /* rounding term and the [0, 255] clamp bounds */
+        mov               tmp, DELTA
+        vdup.32           qDelta, tmp
+        veor              qMin, qMin, qMin
+        mov               tmp, #255
+        vdup.32           qMax, tmp
+
+        /* width is kept biased by -8 while full groups remain */
+        subs              width, width, #8
+        blt               VResizeResidualLoop
+
+VResizeMainLoop:
+        /* load exactly the eight elements consumed by this iteration */
+        vld1.s32          {qS0_0123, qS0_4567}, [pS0]!
+        vld1.s32          {qS1_0123, qS1_4567}, [pS1]!
+
+        vmul.s32          qTmp_0123, qS0_0123, dBeta0
+        vmul.s32          qTmp_4567, qS0_4567, dBeta0
+        vmla.s32          qTmp_0123, qS1_0123, dBeta1
+        vmla.s32          qTmp_4567, qS1_4567, dBeta1
+
+        vadd.s32          qTmp_0123, qTmp_0123, qDelta
+        vadd.s32          qTmp_4567, qTmp_4567, qDelta
+
+        vshr.s32          qTmp_0123, qTmp_0123, #BITS
+        vshr.s32          qTmp_4567, qTmp_4567, #BITS
+
+        vmax.s32          qTmp_0123, qTmp_0123, qMin
+        vmax.s32          qTmp_4567, qTmp_4567, qMin
+        vmin.s32          qTmp_0123, qTmp_0123, qMax
+        vmin.s32          qTmp_4567, qTmp_4567, qMax
+
+        /* 32 -> 16 -> 8 bit; the values are already clamped to [0, 255] */
+        vmovn.I32         dTmp_0123, qTmp_0123
+        vmovn.I32         dTmp_4567, qTmp_4567
+        vmovn.I16         dTmp_01234567, qTmp_01234567
+        vst1.8            {dTmp_01234567}, [pDst]!
+
+        subs              width, width, #8
+        bge               VResizeMainLoop
+
+VResizeResidualLoop:
+        /* undo the bias: width is now the number of outputs still missing */
+        adds              width, width, #8
+        ble               VResizeEnd
+
+        /* table entry (width - 1) selects the low `width` bytes */
+        sub               width, width, #1
+        ldr               pMask, =ne10_vresize_mask_residual_table
+        add               pMask, pMask, width, lsl #3
+        vld1.64           {dMask}, [pMask]
+        vld1.64           {dDst_01234567}, [pDst]
+
+        vld1.s32          {qS0_0123, qS0_4567}, [pS0]
+        vld1.s32          {qS1_0123, qS1_4567}, [pS1]
+
+        vmul.s32          qTmp_0123, qS0_0123, dBeta0
+        vmul.s32          qTmp_4567, qS0_4567, dBeta0
+        vmla.s32          qTmp_0123, qS1_0123, dBeta1
+        vmla.s32          qTmp_4567, qS1_4567, dBeta1
+
+        vadd.s32          qTmp_0123, qTmp_0123, qDelta
+        vadd.s32          qTmp_4567, qTmp_4567, qDelta
+
+        vshr.s32          qTmp_0123, qTmp_0123, #BITS
+        vshr.s32          qTmp_4567, qTmp_4567, #BITS
+
+        vmax.s32          qTmp_0123, qTmp_0123, qMin
+        vmax.s32          qTmp_4567, qTmp_4567, qMin
+        vmin.s32          qTmp_0123, qTmp_0123, qMax
+        vmin.s32          qTmp_4567, qTmp_4567, qMax
+
+        vmovn.I32         dTmp_0123, qTmp_0123
+        vmovn.I32         dTmp_4567, qTmp_4567
+        vmovn.I16         dTmp_01234567, qTmp_01234567
+
+        /* vbsl writes its result to the mask register: new bytes where the
+           mask is set, previous dst bytes elsewhere. Store that blend. */
+        vbsl              dMask, dTmp_01234567, dDst_01234567
+        vst1.8            {dMask}, [pDst]
+VResizeEnd:
+        /*Return From Function*/
+        pop               {r4-r6,pc}
+
+
+/*ARM Registers*/
+.unreq  pSrc
+.unreq  pDst
+.unreq  pBeta
+.unreq  width
+
+.unreq  pS0
+.unreq  pS1
+.unreq  tmp
+
+.unreq  beta0
+.unreq  beta1
+
+.unreq
pMask
+
+        /*NEON variable Declaration*/
+.unreq  dBeta0
+.unreq  dBeta1
+.unreq  qDelta
+.unreq  dMask
+
+.unreq  qMin
+.unreq  qMax
+
+.unreq  qS0_0123
+.unreq  qS0_4567
+.unreq  qS1_0123
+.unreq  qS1_4567
+.unreq  qTmp_0123
+.unreq  qTmp_4567
+.unreq  dTmp_0123
+.unreq  dTmp_4567
+
+.unreq  qTmp_01234567
+.unreq  dTmp_01234567
+.unreq  dDst_01234567
+
+        /**
+         * @details
+         * Horizontal interpolation of two rows of 4-channel 8-bit pixels into 32-bit sums, four outputs per step
+         *
+         * @param[in]  **src   points to input pointers; src[0] and src[1] are the two rows read
+         * @param[out] **dst   points to the output pointers; dst[0] and dst[1] are the two rows written
+         * @param[in]  count   not read by this routine (its register is reused for xofs)
+         * @param[in]  *xofs   points to interpolate offset, added as a byte offset to each row; every 4th entry is read
+         * @param[in]  *alpha  points to interpolate parameter: 16-bit pairs, de-interleaved into two taps by vld2.16
+         * @param[in]  swidth  width of input buffer (not read by this routine)
+         * @param[in]  dwidth  width of output buffer; dwidth - xmax outputs are handled by the tail loop
+         * @param[in]  cn      not read by this routine
+         * @param[in]  xmin    not read by this routine
+         * @param[in]  xmax    number of outputs computed with both taps (main loop)
+         */
+
+        .align 4
+        .global   ne10_hresize_4channels_neon
+        .thumb
+        .thumb_func
+        .equ      INTER_RESIZE_COEF_SCALE, 0x800 /* 1 << INTER_RESIZE_COEF_BITS */
+
+ne10_hresize_4channels_neon:
+        push              {r4-r10,lr}            /* 8 registers = 32 bytes, so the stack arguments start at [sp, #32] */
+
+/*ARM Registers*/
+pIn0             .req   r0
+pIn1             .req   r1
+pIn2             .req   r2
+pIn3             .req   r3
+
+pSrc             .req   r0
+pDst             .req   r1
+pXofs            .req   r2
+pAlpha           .req   r3
+
+pS0              .req   r4
+pS1              .req   r0
+pD0              .req   r5
+pD1              .req   r1
+
+dwidth           .req   r6
+xmax             .req   r7
+
+sx               .req   r8
+tmp              .req   r12
+pTmp0            .req   r9
+pTmp1            .req   r10
+
+
+/*NEON variable Declaration*/
+dAlpha_0         .dn   d0
+dAlpha_1         .dn   d1
+dCoeff           .dn   d2
+
+dS0_01234567     .dn   d4
+dS1_01234567     .dn   d5
+qS0_01234567     .qn   q11
+dS0_0123         .dn   d22
+dS0_4567         .dn   d23
+qS1_01234567     .qn   q8
+dS1_0123         .dn   d16
+dS1_4567         .dn   d17
+
+qDst0_0123       .qn   q9
+qDst1_0123       .qn   q10
+
+        ldr               pS0, [pSrc], #4
+        ldr               pS1, [pSrc]             /* pS1 shares r0 with pSrc, so pSrc is gone after this load */
+        ldr               pD0, [pDst], #4
+        ldr               pD1, [pDst]             /* likewise pD1 replaces pDst in r1 */
+
+        mov               tmp, INTER_RESIZE_COEF_SCALE
+        vdup.16           dCoeff, tmp
+
+        mov               pXofs, pIn3             /* xofs arrives in r3; this overwrites count in r2 */
+        ldr               pAlpha, [sp, #32]       /* 5th argument: alpha */
+        ldr               dwidth, [sp, #40]       /* 7th argument: dwidth */
+        ldr               xmax, [sp, #52]         /* 10th argument: xmax */
+        sub               dwidth, dwidth, xmax    /* calculate the residual */
+
+        subs              xmax, xmax, #4
+        blt               HResize4ResidualLoop    /* NOTE(review): this path reaches the tail loop before dS0/dS1 were loaded and skips the cbz check - confirm callers always pass xmax >= 4 */
+
+        ldr               sx, [pXofs], #16        /* for 4 channels only, xofs is changed based on channels */
+        add               pTmp0, pS0, sx          /* find the address of starting element */
+        add               pTmp1, pS1, sx
+        vld2.16           {dAlpha_0, dAlpha_1}, [pAlpha]!   /* alpha is repeated based on channels */
+        vld1.8            {dS0_01234567}, [pTmp0]
+        vld1.8            {dS1_01234567}, [pTmp1]
+
+HResize4MainLoop:
+
+        vmovl.u8          qS0_01234567, dS0_01234567
+        vmovl.u8          qS1_01234567, dS1_01234567
+
+        vmull.u16         qDst0_0123, dS0_0123, dAlpha_0    /* first four bytes times the first tap */
+        vmull.u16         qDst1_0123, dS1_0123, dAlpha_0
+        vmlal.u16         qDst0_0123, dS0_4567, dAlpha_1    /* plus the next four bytes times the second tap */
+        vmlal.u16         qDst1_0123, dS1_4567, dAlpha_1
+
+        vst1.32           {qDst0_0123}, [pD0]!
+        vst1.32           {qDst1_0123}, [pD1]!
+
+        ldr               sx, [pXofs], #16        /* for 4 channels only, xofs is changed based on channels; this preloads the next iteration's inputs */
+        add               pTmp0, pS0, sx          /* find the address of starting element */
+        add               pTmp1, pS1, sx
+        vld2.16           {dAlpha_0, dAlpha_1}, [pAlpha]!   /* alpha is repeated based on channels */
+        vld1.8            {dS0_01234567}, [pTmp0]
+        vld1.8            {dS1_01234567}, [pTmp1]
+
+        subs              xmax, xmax, #4
+        bge               HResize4MainLoop
+
+        cbz               dwidth, HResize4End     /* no tail outputs left */
+
+HResize4ResidualLoop:
+
+        vmovl.u8          qS0_01234567, dS0_01234567
+        vmovl.u8          qS1_01234567, dS1_01234567
+
+        vmull.u16         qDst0_0123, dS0_0123, dCoeff      /* tail: first four bytes scaled by INTER_RESIZE_COEF_SCALE, no second tap */
+        vmull.u16         qDst1_0123, dS1_0123, dCoeff
+
+        vst1.32           {qDst0_0123}, [pD0]!
+        vst1.32           {qDst1_0123}, [pD1]!
+
+        ldr               sx, [pXofs], #16        /* for 4 channels only, xofs is changed based on channels */
+        add               pTmp0, pS0, sx          /* find the address of starting element */
+        add               pTmp1, pS1, sx
+        vld1.8            {dS0_01234567}, [pTmp0]
+        vld1.8            {dS1_01234567}, [pTmp1]
+
+        subs              dwidth, dwidth, #4
+        bgt               HResize4ResidualLoop
+
+HResize4End:
+        /*Return From Function*/
+        pop               {r4-r10,pc}
+
+
+        /*ARM Registers*/
+.unreq  pIn0
+.unreq  pIn1
+.unreq  pIn2
+.unreq  pIn3
+
+.unreq  pSrc
+.unreq  pDst
+.unreq  pXofs
+.unreq  pAlpha
+
+.unreq  pS0
+.unreq  pS1
+.unreq  pD0
+.unreq  pD1
+
+.unreq  dwidth
+.unreq  xmax
+.unreq  sx
+.unreq  tmp
+
+/*NEON variable Declaration; NOTE(review): pTmp0 and pTmp1 are not released above*/
+.unreq  dAlpha_0
+.unreq  dAlpha_1
+.unreq  dCoeff
+
+.unreq  dS0_01234567
+.unreq  dS1_01234567
+.unreq  qS0_01234567
+.unreq  dS0_0123
+.unreq  dS0_4567
+.unreq  qS1_01234567
+.unreq  dS1_0123
+.unreq  dS1_4567
+
+.unreq  qDst0_0123
+.unreq  qDst1_0123
+
+
+        .end
-- 
2.7.4