From a4d7cf270e4dfeca379d1801e5a7778e7e8c9411 Mon Sep 17 00:00:00 2001 From: yang Date: Fri, 19 Oct 2012 11:02:42 +0800 Subject: [PATCH] add fir and iir filter function: 1. fir 2. fir decimate 3. fir interpolate 4. fir lattice 5. fir sparse 6. iir lattice --- common/NE10_mask_table.c | 19 + common/NE10_mask_table.h | 6 +- inc/NE10_dsp.h | 160 ++++ inc/NE10_types.h | 70 ++ modules/CMakeLists.txt | 6 + modules/dsp/NE10_fir.c | 1280 ++++++++++++++++++++++++++++ modules/dsp/NE10_fir.neon.s | 1959 +++++++++++++++++++++++++++++++++++++++++++ modules/dsp/NE10_fir_init.c | 276 ++++++ modules/dsp/NE10_iir.c | 306 +++++++ modules/dsp/NE10_iir.neon.s | 378 +++++++++ modules/dsp/NE10_iir_init.c | 61 ++ modules/dsp/NE10_init_dsp.c | 51 ++ 12 files changed, 4570 insertions(+), 2 deletions(-) create mode 100644 modules/dsp/NE10_fir.c create mode 100644 modules/dsp/NE10_fir.neon.s create mode 100644 modules/dsp/NE10_fir_init.c create mode 100644 modules/dsp/NE10_iir.c create mode 100644 modules/dsp/NE10_iir.neon.s create mode 100644 modules/dsp/NE10_iir_init.c diff --git a/common/NE10_mask_table.c b/common/NE10_mask_table.c index c0a414b..a7845aa 100644 --- a/common/NE10_mask_table.c +++ b/common/NE10_mask_table.c @@ -36,4 +36,23 @@ const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE] = }; +const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]= +{ + 65535,32768,21845,16384,13107,10923,9362,8192,7282,6554,5958,5461,5041,4681,4369,4096, + 3855,3641,3449,3277,3121,2979,2849,2731,2621,2521,2427,2341,2260,2185,2114,2048, + 1986,1928,1872,1820,1771,1725,1680,1638,1598,1560,1524,1489,1456,1425,1394,1365, + 1337,1311,1285,1260,1237,1214,1192,1170,1150,1130,1111,1092,1074,1057,1040,1024, + 1008,993,978,964,950,936,923,910,898,886,874,862,851,840,830,819, + 809,799,790,780,771,762,753,745,736,728,720,712,705,697,690,683, + 676,669,662,655,649,643,636,630,624,618,612,607,601,596,590,585, + 580,575,570,565,560,555,551,546,542,537,533,529,524,520,516,512, + 508,504,500,496,493,489,485,482,478,475,471,468,465,462,458,455, + 452,449,446,443,440,437,434,431,428,426,423,420,417,415,412,410, + 407,405,402,400,397,395,392,390,388,386,383,381,379,377,374,372, + 370,368,366,364,362,360,358,356,354,352,350,349,347,345,343,341, + 340,338,336,334,333,331,329,328,326,324,323,321,320,318,317,315, + 314,312,311,309,308,306,305,303,302,301,299,298,297,295,294,293, + 291,290,289,287,286,285,284,282,281,280,279,278,277,275,274,273, + 272,271,270,269,267,266,265,264,263,262,261,260,259,258,257 + }; diff --git a/common/NE10_mask_table.h b/common/NE10_mask_table.h index 9787ffc..67211c4 100644 --- a/common/NE10_mask_table.h +++ b/common/NE10_mask_table.h @@ -22,10 +22,12 @@ #ifndef _ARM_MASK_TABLE_H #define _ARM_MASK_TABLE_H -#define Q_MASK_TABLE_SIZE 20 -#define D_MASK_TABLE_SIZE 6 +#define Q_MASK_TABLE_SIZE 20 +#define D_MASK_TABLE_SIZE 6 +#define DIV_LOOKUP_TABLE_SIZE 255 extern const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE]; extern const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE]; +extern const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]; #endif diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h index 1cc492e..5240126 100644 --- a/inc/NE10_dsp.h +++ b/inc/NE10_dsp.h @@ -93,6 +93,166 @@ extern void ne10_rfft_float_neon( ne10_float32_t * pSrc, ne10_float32_t * pDst, ne10_float32_t * pTemp); +/* fir functions*/ + +/* function pointers*/ +extern void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void (*ne10_fir_decimate_float)( + const ne10_fir_decimate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void (*ne10_fir_interpolate_float)( + const ne10_fir_interpolate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void (*ne10_fir_lattice_float)( + const ne10_fir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void (*ne10_fir_sparse_float)( + ne10_fir_sparse_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_float32_t * pScratchIn, + ne10_uint32_t blockSize); + + +/* init functions*/ +extern ne10_result_t ne10_fir_init_float(ne10_fir_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize); + +extern ne10_result_t ne10_fir_decimate_init_float( + ne10_fir_decimate_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_uint8_t M, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize); + +extern ne10_result_t ne10_fir_interpolate_init_float( + ne10_fir_interpolate_instance_f32_t * S, + ne10_uint8_t L, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize); + +extern ne10_result_t ne10_fir_lattice_init_float( + ne10_fir_lattice_instance_f32_t * S, + ne10_uint16_t numStages, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState); + +extern ne10_result_t ne10_fir_sparse_init_float( + ne10_fir_sparse_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_int32_t * pTapDelay, + ne10_uint16_t maxDelay, + ne10_uint32_t blockSize); + +/* C version*/ +extern void ne10_fir_float_c(const ne10_fir_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_decimate_float_c( + const ne10_fir_decimate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_interpolate_float_c( + const ne10_fir_interpolate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_lattice_float_c( + const ne10_fir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_sparse_float_c( + ne10_fir_sparse_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_float32_t * pScratchIn, + ne10_uint32_t blockSize); + + +/* NEON version*/ +extern void ne10_fir_float_neon(const ne10_fir_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_decimate_float_neon(const ne10_fir_decimate_instance_f32_t * S, + ne10_float32_t *pSrc, + ne10_float32_t *pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_interpolate_float_neon(const ne10_fir_interpolate_instance_f32_t * S, + ne10_float32_t *pSrc, + ne10_float32_t *pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_lattice_float_neon(const ne10_fir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +extern void ne10_fir_sparse_float_neon(ne10_fir_sparse_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_float32_t * pScratch, + ne10_uint32_t blockSize); + + +/* iir functions*/ + +/* function pointers*/ +extern void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +/* init functions*/ +extern ne10_result_t ne10_iir_lattice_init_float(ne10_iir_lattice_instance_f32_t * S, + ne10_uint16_t numStages, + ne10_float32_t * pkCoeffs, + ne10_float32_t * pvCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize); + + +/* C version*/ +extern void ne10_iir_lattice_float_c(const ne10_iir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +/* NEON version*/ +extern void ne10_iir_lattice_float_neon(const ne10_iir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); #ifdef __cplusplus } diff --git a/inc/NE10_types.h b/inc/NE10_types.h index e4db06e..bd03629 100644 --- a/inc/NE10_types.h +++ b/inc/NE10_types.h @@ -199,4 +199,74 @@ typedef struct ne10_cfft_radix4_instance_f32_t *p_cfft; /**< Pointer to the complex FFT Instance. */ } ne10_rfft_instance_f32_t; +///////////////////////////////////////////////////////// +// definitions for fir +///////////////////////////////////////////////////////// + +/* + * @brief Instance structure for the floating-point FIR filter. + */ +typedef struct +{ + ne10_uint16_t numTaps; /**< Length of the filter. */ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */ + ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps. */ +} ne10_fir_instance_f32_t; + +/* + * @brief Instance structure for the floating point FIR Lattice filter. + */ +typedef struct +{ + ne10_uint16_t numStages; /**< numStages of the of lattice filter. */ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numStages. */ + ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numStages. */ +} ne10_fir_lattice_instance_f32_t; + +/* + * @brief Instance structure for the floating-point FIR Decimation. + */ +typedef struct +{ + ne10_uint8_t M; /**< Decimation Factor. */ + ne10_uint16_t numTaps; /**< Length of the filter. */ + ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */ +} ne10_fir_decimate_instance_f32_t; + +/* + * @brief Instance structure for the floating-point FIR Interpolation. + */ +typedef struct +{ + ne10_uint8_t L; /**< Interpolation Factor. */ + ne10_uint16_t phaseLength; /**< Length of each polyphase filter component. */ + ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */ +} ne10_fir_interpolate_instance_f32_t; + +/* + * @brief Instance structure for the floating-point FIR Sparse filter. + */ +typedef struct +{ + ne10_uint16_t numTaps; /**< Length of the filter. */ + ne10_uint16_t stateIndex; /**< Index pointer for the state buffer .*/ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */ + ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/ + ne10_uint16_t maxDelay; /**< the largest number of delay line values .*/ + ne10_int32_t *pTapDelay; /**< Pointer to the array containing positions of the non-zero tap values. */ +} ne10_fir_sparse_instance_f32_t; + +/** + * @brief Instance structure for the floating point IIR Lattice filter. + */ +typedef struct +{ + ne10_uint16_t numStages; /**< numStages of the of lattice filter. */ + ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numStages + blockSize -1. */ + ne10_float32_t *pkCoeffs; /**< Points to the reflection coefficient array. The array is of length numStages. */ + ne10_float32_t *pvCoeffs; /**< Points to the ladder coefficient array. The array is of length numStages+1. */ +} ne10_iir_lattice_instance_f32_t; + #endif diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt index 7329b63..f74ed9d 100644 --- a/modules/CMakeLists.txt +++ b/modules/CMakeLists.txt @@ -122,6 +122,10 @@ if(NE10_ENABLE_DSP) ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft_init.c ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft.c ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_init.c + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.c + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir_init.c + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.c + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir_init.c ) # Add dsp intrinsic NEON files. @@ -137,6 +141,8 @@ if(NE10_ENABLE_DSP) # Add dsp NEON files. set(NE10_DSP_NEON_SRCS ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft.neon.s + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.neon.s + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.neon.s ) # Tell CMake these files need to go to the C compiler diff --git a/modules/dsp/NE10_fir.c b/modules/dsp/NE10_fir.c new file mode 100644 index 0000000..4c95d28 --- /dev/null +++ b/modules/dsp/NE10_fir.c @@ -0,0 +1,1280 @@ +/* + * Copyright 2012 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NE10 Library : dsp/NE10_fir.c + */ + +#include "NE10_types.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup FIR Finite Impulse Response (FIR) Filters + * + * This set of functions implements Finite Impulse Response (FIR) filters + * for floating-point data types. + * The functions operate on blocks of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst points to input and output arrays containing blockSize values. + * + * \par Algorithm: + * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations. + * Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n]. + *
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * 
+ * \par + * \image html FIR.gif "Finite Impulse Response filter" + * \par + * pCoeffs points to a coefficient array of size numTaps. + * Coefficients are stored in time reversed order. + * \par + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to a state array of size numTaps + blockSize - 1. + * Samples in the state buffer are stored in the following order. + * \par + *
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * 
+ * \par + * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1. + * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters, + * to be avoided and yields a significant speed improvement. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 4 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros before static initialization. + * The code below statically initializes each of the 4 different data type filter instance structures + *
+ *ne10_fir_instance_f32_t S = {numTaps, pState, pCoeffs};
+ * 
+ * + * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; + * pCoeffs is the address of the coefficient buffer. + * + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the FIR filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * + * @param[in] *S points to an instance of the floating-point FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + */ + +void ne10_fir_float_c ( + const ne10_fir_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize) +{ + + ne10_float32_t *pState = S->pState; /* State pointer */ + ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */ + ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + ne10_uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + ne10_float32_t acc0, acc1, acc2, acc3; /* Accumulators */ + ne10_float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = & (S->pState[ (numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-3] sample */ + x3 = * (px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + acc0 += x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + acc1 += x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + acc2 += x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + acc3 += x3 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c0; + acc1 += x2 * c0; + acc2 += x3 * c0; + acc3 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = * (px++); + + /* Perform the multiply-accumulates */ + acc0 += x2 * c0; + acc1 += x3 * c0; + acc2 += x0 * c0; + acc3 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = * (px++); + + /* Perform the multiply-accumulates */ + acc0 += x3 * c0; + acc1 += x0 * c0; + acc2 += x1 * c0; + acc3 += x2 * c0; + + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while (tapCnt > 0u) + { + /* Read coefficients */ + c0 = * (pb++); + + /* Fetch 1 state variable */ + x3 = * (px++); + + /* Perform the multiply-accumulates */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + + while (blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } + while (i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +/** + * @brief Processing function for the floating-point FIR decimator. + * @param[in] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void ne10_fir_decimate_float_c ( + const ne10_fir_decimate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize) +{ + ne10_float32_t *pState = S->pState; /* State pointer */ + ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */ + ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + ne10_float32_t sum0; /* Accumulator */ + ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */ + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Total number of output samples to be computed */ + blkCnt = outBlockSize; + + while (blkCnt > 0u) + { + /* Copy decimation factor number of new input samples into the state buffer */ + i = S->M; + + do + { + *pStateCurnt++ = *pSrc++; + + } + while (--i); + + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = pCoeffs; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + while (tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-1] sample */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-2] sample */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-3] sample */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = * (pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = numTaps % 0x4u; + + while (tapCnt > 0u) + { + /* Read coefficients */ + c0 = * (pb++); + + /* Fetch 1 state variable */ + x0 = * (px++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by the decimation factor + * to process the next group of decimation factor number samples */ + pState = pState + S->M; + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + i = (numTaps - 1u) >> 2; + + /* copy data */ + while (i > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + + i = (numTaps - 1u) % 0x04u; + + /* copy data */ + while (i > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + i--; + } + +} + +/** + * @brief Processing function for the floating-point FIR interpolator. + * @param[in] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void ne10_fir_interpolate_float_c ( + const ne10_fir_interpolate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize) +{ + ne10_float32_t *pState = S->pState; /* State pointer */ + ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */ + ne10_float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */ + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + ne10_float32_t sum0; /* Accumulators */ + ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */ + ne10_uint32_t i, blkCnt, j; /* Loop counters */ + ne10_uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */ + + + /* S->pState buffer contains previous frame (phaseLen - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (phaseLen - 1u); + + /* Total number of intput samples */ + blkCnt = blockSize; + + /* Loop over the blockSize. */ + while (blkCnt > 0u) + { + /* Copy new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Address modifier index of coefficient buffer */ + j = 1u; + + /* Loop over the Interpolation factor. */ + i = S->L; + while (i > 0u) + { + /* Set accumulator to zero */ + sum0 = 0.0f; + + /* Initialize state pointer */ + ptr1 = pState; + + /* Initialize coefficient pointer */ + ptr2 = pCoeffs + (S->L - j); + + /* Loop over the polyPhase length. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + tapCnt = phaseLen >> 2u; + while (tapCnt > 0u) + { + + /* Read the coefficient */ + c0 = * (ptr2); + + /* Upsampling is done by stuffing L-1 zeros between each sample. + * So instead of multiplying zeros with coefficients, + * Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = * (ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = * (ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = * (ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = * (ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = * (ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Read the coefficient */ + c0 = * (ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Read the input sample */ + x0 = * (ptr1++); + + /* Perform the multiply-accumulate */ + sum0 += x0 * c0; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = phaseLen % 0x4u; + + while (tapCnt > 0u) + { + /* Perform the multiply-accumulate */ + sum0 += * (ptr1++) * (*ptr2); + + /* Increment the coefficient pointer by interpolation factor times. */ + ptr2 += S->L; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* The result is in the accumulator, store in the destination buffer. */ + *pDst++ = sum0; + + /* Increment the address modifier index of coefficient buffer */ + j++; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 1 + * to process the next group of interpolation factor number samples */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (phaseLen - 1u) >> 2u; + + /* copy data */ + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + tapCnt = (phaseLen - 1u) % 0x04u; + + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +/** + * @brief Processing function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void ne10_fir_lattice_float_c ( + const ne10_fir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize) +{ + ne10_float32_t *pState; /* State pointer */ + ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + ne10_float32_t *px; /* temporary state pointer */ + ne10_float32_t *pk; /* temporary coefficient pointer */ + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + ne10_float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */ + ne10_float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */ + ne10_float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */ + ne10_float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */ + ne10_uint32_t numStages = S->numStages; /* Number of stages in the filter */ + ne10_uint32_t blkCnt, stageCnt; /* temporary variables for counts */ + + gcurr1 = 0.0f; + pState = &S->pState[0]; + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0u) + { + + /* Read two samples from input buffer */ + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + fcurr2 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* Read g0(n-1) from state */ + gcurr1 = *px; + + /* Process first sample for first tap */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk)) + gcurr1; + + /* Process second sample for first tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ( (*pk) * fcurr1); + gnext2 = (fcurr2 * (*pk)) + fcurr1; + + /* Read next two samples from input buffer */ + /* f0(n+2) = x(n+2) */ + fcurr3 = *pSrc++; + fcurr4 = *pSrc++; + + /* Copy only last input samples into the state buffer + which will be used for next four samples processing */ + *px++ = fcurr4; + + /* Process third sample for first tap */ + fnext3 = fcurr3 + ( (*pk) * fcurr2); + gnext3 = (fcurr3 * (*pk)) + fcurr2; + + /* Process fourth sample for first tap */ + fnext4 = fcurr4 + ( (*pk) * fcurr3); + gnext4 = (fcurr4 * (*pk++)) + fcurr3; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + /* Loop unrolling. Process 4 taps at a time . */ + stageCnt = (numStages - 1u) >> 2u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numStages-3 coefficients. */ + + /* Process 2nd, 3rd, 4th and 5th taps ... here */ + while (stageCnt > 0u) + { + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext4; + + /* Process first sample for 2nd, 6th .. tap */ + /* Sample processing for K2, K6.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + /* Process second sample for 2nd, 6th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ( (*pk) * gnext1); + /* Process third sample for 2nd, 6th .. tap */ + fnext3 = fcurr3 + ( (*pk) * gnext2); + /* Process fourth sample for 2nd, 6th .. tap */ + fnext4 = fcurr4 + ( (*pk) * gnext3); + + /* g2(n) = f1(n) * K2 + g1(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g2(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K3, K7.... */ + /* Process first sample for 3rd, 7th .. tap */ + /* f3(n) = f2(n) + K3 * g2(n-1) */ + fcurr1 = fnext1 + ( (*pk) * gcurr1); + /* Process second sample for 3rd, 7th .. tap */ + fcurr2 = fnext2 + ( (*pk) * gnext1); + /* Process third sample for 3rd, 7th .. tap */ + fcurr3 = fnext3 + ( (*pk) * gnext2); + /* Process fourth sample for 3rd, 7th .. tap */ + fcurr4 = fnext4 + ( (*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g3(n) = f2(n) * K3 + g2(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + + /* Read g1(n-1), g3(n-1) .... from state */ + gcurr1 = *px; + + /* save g3(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K4, K8.... */ + /* Process first sample for 4th, 8th .. tap */ + /* f4(n) = f3(n) + K4 * g3(n-1) */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + /* Process second sample for 4th, 8th .. tap */ + /* for sample 2 processing */ + fnext2 = fcurr2 + ( (*pk) * gnext1); + /* Process third sample for 4th, 8th .. tap */ + fnext3 = fcurr3 + ( (*pk) * gnext2); + /* Process fourth sample for 4th, 8th .. tap */ + fnext4 = fcurr4 + ( (*pk) * gnext3); + + /* g4(n) = f3(n) * K4 + g3(n-1) */ + /* Calculation of state values for next stage */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Read g2(n-1), g4(n-1) .... from state */ + gcurr1 = *px; + + /* save g4(n) in state buffer */ + *px++ = gnext4; + + /* Sample processing for K5, K9.... */ + /* Process first sample for 5th, 9th .. tap */ + /* f5(n) = f4(n) + K5 * g4(n-1) */ + fcurr1 = fnext1 + ( (*pk) * gcurr1); + /* Process second sample for 5th, 9th .. tap */ + fcurr2 = fnext2 + ( (*pk) * gnext1); + /* Process third sample for 5th, 9th .. tap */ + fcurr3 = fnext3 + ( (*pk) * gnext2); + /* Process fourth sample for 5th, 9th .. tap */ + fcurr4 = fnext4 + ( (*pk) * gnext3); + + /* Calculation of state values for next stage */ + /* g5(n) = f4(n) * K5 + g4(n-1) */ + gnext4 = (fnext4 * (*pk)) + gnext3; + gnext3 = (fnext3 * (*pk)) + gnext2; + gnext2 = (fnext2 * (*pk)) + gnext1; + gnext1 = (fnext1 * (*pk++)) + gcurr1; + + stageCnt--; + } + + /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */ + stageCnt = (numStages - 1u) % 0x4u; + + while (stageCnt > 0u) + { + gcurr1 = *px; + + /* save g value in state buffer */ + *px++ = gnext4; + + /* Process four samples for last three taps here */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + fnext2 = fcurr2 + ( (*pk) * gnext1); + fnext3 = fcurr3 + ( (*pk) * gnext2); + fnext4 = fcurr4 + ( (*pk) * gnext3); + + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext4 = (fcurr4 * (*pk)) + gnext3; + gnext3 = (fcurr3 * (*pk)) + gnext2; + gnext2 = (fcurr2 * (*pk)) + gnext1; + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* Update of f values for next coefficient set processing */ + fcurr1 = fnext1; + fcurr2 = fnext2; + fcurr3 = fnext3; + fcurr4 = fnext4; + + stageCnt--; + + } + + /* The results in the 4 accumulators, store in the destination buffer. */ + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + *pDst++ = fcurr2; + *pDst++ = fcurr3; + *pDst++ = fcurr4; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + + while (blkCnt > 0u) + { + /* f0(n) = x(n) */ + fcurr1 = *pSrc++; + + /* Initialize coeff pointer */ + pk = (pCoeffs); + + /* Initialize state pointer */ + px = pState; + + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* for sample 1 processing */ + /* f1(n) = f0(n) + K1 * g0(n-1) */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + /* g1(n) = f0(n) * K1 + g0(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* save g1(n) in state buffer */ + *px++ = fcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt = (numStages - 1u); + + /* stage loop */ + while (stageCnt > 0u) + { + /* read g2(n) from state buffer */ + gcurr1 = *px; + + /* save g1(n) in state buffer */ + *px++ = gnext1; + + /* Sample processing for K2, K3.... */ + /* f2(n) = f1(n) + K2 * g1(n-1) */ + fnext1 = fcurr1 + ( (*pk) * gcurr1); + /* g2(n) = f1(n) * K2 + g1(n-1) */ + gnext1 = (fcurr1 * (*pk++)) + gcurr1; + + /* f1(n) is saved in fcurr1 + for next stage processing */ + fcurr1 = fnext1; + + stageCnt--; + + } + + /* y(n) = fN(n) */ + *pDst++ = fcurr1; + + blkCnt--; + + } + +} +/** + * @brief floating-point Circular write function. + */ + +static void ne10_circular_write_float ( + ne10_int32_t * circBuffer, + ne10_int32_t L, + ne10_uint16_t * writeOffset, + ne10_int32_t bufferInc, + const ne10_int32_t * src, + ne10_int32_t srcInc, + ne10_uint32_t blockSize) +{ + ne10_uint32_t i = 0u; + ne10_int32_t wOffset; + + /* Copy the value of Index pointer that points + * to the current location where the input samples to be copied */ + wOffset = *writeOffset; + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0u) + { + /* copy the input sample to the circular buffer */ + circBuffer[wOffset] = *src; + + /* Update the input pointer */ + src += srcInc; + + /* Circularly update wOffset. Watch out for positive and negative value */ + wOffset += bufferInc; + if (wOffset >= L) + wOffset -= L; + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *writeOffset = wOffset; +} + + + +/** + * @brief floating-point Circular Read function. + */ +static void ne10_circular_read_float ( + ne10_int32_t * circBuffer, + ne10_int32_t L, + ne10_int32_t * readOffset, + ne10_int32_t bufferInc, + ne10_int32_t * dst, + ne10_int32_t * dst_base, + ne10_int32_t dst_length, + ne10_int32_t dstInc, + ne10_uint32_t blockSize) +{ + ne10_uint32_t i = 0u; + ne10_int32_t rOffset, dst_end; + + /* Copy the value of Index pointer that points + * to the current location from where the input samples to be read */ + rOffset = *readOffset; + dst_end = (ne10_int32_t) (dst_base + dst_length); + + /* Loop over the blockSize */ + i = blockSize; + + while (i > 0u) + { + /* copy the sample from the circular buffer to the destination buffer */ + *dst = circBuffer[rOffset]; + + /* Update the input pointer */ + dst += dstInc; + + if (dst == (ne10_int32_t *) dst_end) + { + dst = dst_base; + } + + /* Circularly update rOffset. Watch out for positive and negative value */ + rOffset += bufferInc; + + if (rOffset >= L) + { + rOffset -= L; + } + + /* Decrement the loop counter */ + i--; + } + + /* Update the index pointer */ + *readOffset = rOffset; +} + + +/** + * @brief Processing function for the floating-point sparse FIR filter. + * @param[in] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data + * @param[in] *pScratchIn points to a temporary buffer of size blockSize. + * @param[in] blockSize number of input samples to process per call. + * @return none. + */ + +void ne10_fir_sparse_float_c ( + ne10_fir_sparse_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_float32_t * pScratchIn, + ne10_uint32_t blockSize) +{ + + ne10_float32_t *pState = S->pState; /* State pointer */ + ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + ne10_float32_t *px; /* Scratch buffer pointer */ + ne10_float32_t *py = pState; /* Temporary pointers for state buffer */ + ne10_float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + ne10_float32_t *pOut; /* Destination pointer */ + ne10_int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + ne10_uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + ne10_uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + ne10_int32_t readIndex; /* Read index of the state buffer */ + ne10_uint32_t tapCnt, blkCnt; /* loop counters */ + ne10_float32_t coeff = *pCoeffs++; /* Read the first coefficient value */ + + + + /* BlockSize of Input samples are copied into the state buffer */ + /* StateIndex points to the starting position to write in the state buffer */ + ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1, + (ne10_int32_t *) pSrc, 1, blockSize); + + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (ne10_int32_t) delaySize; + } + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1, + (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 Multiplications at a time. */ + blkCnt = blockSize >> 2u; + + while (blkCnt > 0u) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while (blkCnt > 0u) + { + /* Perform Multiplications and store in destination buffer */ + *pOut++ = *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (ne10_int32_t) delaySize; + } + + /* Loop over the number of taps. */ + tapCnt = (ne10_uint32_t) numTaps - 1u; + + while (tapCnt > 0u) + { + + /* Working pointer for state buffer is updated */ + py = pState; + + /* blockSize samples are read from the state buffer */ + ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1, + (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1, + blockSize); + + /* Working pointer for the scratch buffer */ + px = pb; + + /* Working pointer for destination buffer */ + pOut = pDst; + + /* Loop over the blockSize. Unroll by a factor of 4. + * Compute 4 MACS at a time. */ + blkCnt = blockSize >> 2u; + + while (blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, + * compute the remaining samples */ + blkCnt = blockSize % 0x4u; + + while (blkCnt > 0u) + { + /* Perform Multiply-Accumulate */ + *pOut++ += *px++ * coeff; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Load the coefficient value and + * increment the coefficient buffer for the next set of state values */ + coeff = *pCoeffs++; + + /* Read Index, from where the state buffer should be read, is calculated. */ + readIndex = ( (ne10_int32_t) S->stateIndex - + (ne10_int32_t) blockSize) - *pTapDelay++; + + /* Wraparound of readIndex */ + if (readIndex < 0) + { + readIndex += (ne10_int32_t) delaySize; + } + + /* Decrement the tap loop counter */ + tapCnt--; + } + +} + + +/** + * @} end of FIR group + */ diff --git a/modules/dsp/NE10_fir.neon.s b/modules/dsp/NE10_fir.neon.s new file mode 100644 index 0000000..f63cb2e --- /dev/null +++ b/modules/dsp/NE10_fir.neon.s @@ -0,0 +1,1959 @@ +@/* +@ * Copyright 2012 ARM Limited +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License")@ +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ */ + +@/* +@ * NE10 Library : dsp/NE10_fir.neon.s +@ */ + + + .text + .syntax unified + + @/** + @ * @details + @ * This function operates on floating-point data types. + @ * There are no restrictions on numTaps and blockSize. + @ * + @ * The order of the coefficients in *coeffs should be + @ * bN, bN-1, bN-2, .....b1, b0 + @ * + @ * Cycle Count: + @ * + @ * 45 + 8 * numTaps + 12.25 * blockSize + 4.375 * numTaps * blockSize + @ * + @ * @param[in] *S points to struct parameter + @ * @param[in] *pSrc points to the input buffer + @ * @param[out] *pDst points to the output buffer + @ * @param[in] blockSize block size of filter + @ */ + + .align 4 + .global ne10_fir_float_neon + .extern ne10_qMaskTable32 + .thumb + .thumb_func + +ne10_fir_float_neon: + PUSH {r4-r12,lr} +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +blockSize .req R3 + +pState .req R4 @/* State pointer */ +pCoeffs .req R5 @/* Coefficient pointer */ +pStateCurnt .req R6 @/* Points to the current sample of the state */ + +pX .req R7 @/* Temporary pointers for state buffer */ +pB .req R8 @/* Temporary pointers for coefficient buffer */ +numTaps .req R9 @/* Length of the filter */ + +tapCnt .req R10 @ /* Loop counter */ +Count .req R11 @ /* Loop counter */ +pTemp .req R11 +pMask .req R14 @ /* Mask Table */ + +mask .req R12 + +@/*NEON variale Declaration*/ +qInp .qn Q0.F32 +dInp_0 .dn D0.F32 +dInp_1 .dn D1.F32 +qCoeff .qn Q1.F32 +dCoeff_0 .dn D2.F32 +dCoeff_1 .dn D3.F32 +qZero .qn Q2.F32 + +qMask .qn Q3.U32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 +dOut_0 .dn D6.F32 +dOut_1 .dn D7.F32 + +qAcc0 .qn Q8.F32 +dAcc0_0 .dn D16.F32 +dAcc0_1 .dn D17.F32 + + +qTemp .qn Q9.F32 +dTemp_0 .dn D18.F32 +dTemp_1 .dn D19.F32 + +qTemp1 .qn Q10.F32 +dTemp1_0 .dn D20.F32 +dTemp1_1 .dn D21.F32 +qTemp2 .qn Q11.F32 +qTemp3 .qn Q12.F32 +qMask1 .qn Q13.U32 +dMask1_0 .dn D26.U32 +dMask1_1 .dn D27.U32 +qMaskTmp .qn Q14.U32 +dMaskTmp_0 .dn D28.U32 +dMaskTmp_1 .dn D29.U32 + + + + + + LDRH numTaps,[pStateStruct],#4 + LDR pState,[pStateStruct],#4 + LDR pCoeffs,[pStateStruct],#4 + + @/* S->state buffer contains previous frame (numTaps - 1) samples */ + @/* pStateCurnt points to the location where the new input data should be written */ + @/*pStateCurnt = &(S->state[(numTaps - 1u)])@*/ + SUB mask,numTaps,#1 + LDR pMask,=ne10_qMaskTable32 + AND tapCnt,numTaps,#3 + ADD pStateCurnt,pState,mask,LSL #2 + AND mask,blockSize,#3 + + + @/* Apply loop unrolling and compute 4 output values simultaneously. + @* The variables acc0 ... acc3 hold output values that are being computed: + @* + @* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + @* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + @* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + @* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + @*/ + + @/*If numTaps,blockSize are not multiples of 4, Get the appropriate Masks*/ + + + ADD pTemp,pMask,tapCnt,LSL #4 + VEOR qZero,qZero + ADD pX,pMask,mask,LSL #4 + VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp] + VLD1 {dMask1_0,dMask1_1},[pX] + + + @/* Copy blockCnt number of new input samples into the state buffer */ + + SUBS blockSize,#4 + BLT firEndOuterLoop + + @/* Compute 4 outputs at a time*/ + +firOuterLoop: + + VLD1 {dTemp_0,dTemp_1},[pSrc]! + MOV pX,pState + MOV pB,pCoeffs + @/* Read the first four samples from the state buffer: + @* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2],x[n-numTaps-3] */ + + VST1 {dTemp_0,dTemp_1},[pStateCurnt]! + @/* Zero the Accumulators*/ + VEOR qAcc0,qAcc0 + VLD1 {dInp_0,dInp_1},[pX]! + + @//* Read the first four coefficients b[numTaps] to b[numTaps-3] */ + VLD1 {dCoeff_0,dCoeff_1},[pB]! + @/* Loop unrolling. Process 4 taps at a time. */ + SUBS tapCnt,numTaps,#4 + VLD1 {dTemp_0,dTemp_1},[pX]! + + BLT firEndInnerLoop + +firInnerLoop: + + + VEXT qTemp1,qInp,qTemp,#1 + @/* acc0 += b[numTaps] * x[n-numTaps-1]+ b[numTaps] * x[n-numTaps-2] + + @* b[numTaps] * x[n-numTaps-3] + b[numTaps] * x[n-numTaps-4]*/ + VMLA qAcc0,qInp,dCoeff_0[0] + VEXT qTemp2,qInp,qTemp,#2 + @/* acc1 += b[numTaps-1] * x[n-numTaps-2]+ b[numTaps-1] * x[n-numTaps-3] + + @b[numTaps-1] * x[n-numTaps-4] +*b[numTaps-1] * x[n-numTaps-5]*/ + VMLA qAcc0,qTemp1,dCoeff_0[1] + VEXT qTemp3,qInp,qTemp,#3 + @/* acc2 += b[numTaps-2] * x[n-numTaps-3]+ b[numTaps-2] * x[n-numTaps-4] + + @b[numTaps-2] * x[n-numTaps-5] + *b[numTaps-2] * x[n-numTaps-6]*/ + @//vacc0q_f32 = vmlaq_lane_f32(vacc0q_f32,vxtemp2q_f32,vget_high_f32(vcq_f32),0)@ + VMLA qAcc0,qTemp2,dCoeff_1[0] + VMOV qInp,qTemp + @/* acc3 += b[numTaps-3] * x[n-numTaps-4]+ b[numTaps-3] * x[n-numTaps-5] + + @b[numTaps-3] * x[n-numTaps-6] +*b[numTaps-3] * x[n-numTaps-7] */ + + VMLA qAcc0,qTemp3,dCoeff_1[1] + + @/* Read the b[numTaps-4] to b[numTaps-7] coefficients */ + VLD1 {dCoeff_0,dCoeff_1},[pB]! + SUBS tapCnt,#4 + VLD1 {dTemp_0,dTemp_1},[pX]! + + BGE firInnerLoop +firEndInnerLoop: + + + @/* If the filter length is not a multiple of 4, compute the remaining filter taps */ + @/*Select only the remaining filter Taps*/ + VMOV qMask,qMaskTmp + VBSL qMask,qCoeff,qZero + VEXT qTemp1,qInp,qTemp,#1 + VMLA qAcc0,qInp,dOut_0[0] + VEXT qTemp2,qInp,qTemp,#2 + VMLA qAcc0,qTemp1,dOut_0[1] + VEXT qTemp3,qInp,qTemp,#3 + VMLA qAcc0,qTemp2,dOut_1[0] + @/* Advance the state pointer by 4 to process the next group of 4 samples */ + ADD pState,#16 + + VMLA qAcc0,qTemp3,dOut_1[1] + + + @/* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 + @ * Then store the 4 outputs in the destination buffer. */ + SUBS blockSize,#4 + VST1 {dAcc0_0,dAcc0_1},[pDst]! + + BGE firOuterLoop + +firEndOuterLoop: + @/*Handle BlockSize Not a Multiple of 4*/ + ADDS blockSize,#4 + BEQ firCopyData + @/*Copy the Remaining BlockSize Number of Input Sample to state Buffer*/ + VMOV qMask,qMask1 + VLD1 {dTemp1_0,dTemp1_1},[pStateCurnt] + VLD1 {dTemp_0,dTemp_1},[pSrc] + + ADD pSrc,pSrc,blockSize,LSL #2 + MOV pX,pState + MOV pB,pCoeffs + + VBSL qMask,qTemp,qTemp1 + VST1 {dMask_0,dMask_1},[pStateCurnt] + VLD1 {dInp_0,dInp_1},[pX]! + + ADD pStateCurnt,pStateCurnt,blockSize, LSL #2 + + @/* Zero the Accumulators*/ + VEOR qAcc0,qAcc0 + VLD1 {dCoeff_0,dCoeff_1},[pB]! + SUBS tapCnt,numTaps,#4 + VLD1 {dTemp_0,dTemp_1},[pX]! + + BLT firEndInnerLoop1 + +firInnerLoop1: + + VEXT qTemp1,qInp,qTemp,#1 + VMLA qAcc0,qInp,dCoeff_0[0] + VEXT qTemp2,qInp,qTemp,#2 + VMLA qAcc0,qTemp1,dCoeff_0[1] + VEXT qTemp3,qInp,qTemp,#3 + VMLA qAcc0,qTemp2,dCoeff_1[0] + VMOV qInp,qTemp + VMLA qAcc0,qTemp3,dCoeff_1[1] + VLD1 {dCoeff_0,dCoeff_1},[pB]! + SUBS tapCnt,#4 + VLD1 {dTemp_0,dTemp_1},[pX]! + + BGE firInnerLoop1 +firEndInnerLoop1: + + + VMOV qMask,qMaskTmp + VBSL qMask,qCoeff,qZero + VEXT qTemp1,qInp,qTemp,#1 + VMLA qAcc0,qInp,dOut_0[0] + VEXT qTemp2,qInp,qTemp,#2 + VMLA qAcc0,qTemp1,dOut_0[1] + VEXT qTemp3,qInp,qTemp,#3 + VMLA qAcc0,qTemp2,dOut_1[0] + VMOV qMask,qMask1 + VLD1 {dTemp_0,dTemp_1},[pDst] + VMLA qAcc0,qTemp3,dOut_1[1] + + + @/* If the blockSize is not a multiple of 4, Mask the unwanted Output */ + + VBSL qMask,qAcc0,qTemp + VST1 {dMask_0,dMask_1},[pDst] + ADD pDst,pDst,blockSize,LSL #2 + ADD pState,pState,blockSize,LSL #2 + + +firCopyData: + @/* Processing is complete. Now shift the data in the state buffer down by + @** blockSize samples. This prepares the state buffer for the next function + @** call. */ + + @/* Points to the start of the state buffer */ + + SUB numTaps,numTaps,#1 + AND mask,numTaps,#3 + LDR pStateCurnt,[pStateStruct,#-8] + ADD pTemp,pMask,mask,LSL #4 + VLD1 {dInp_0,dInp_1},[pState]! + VLD1 {dMask_0,dMask_1},[pTemp] + + + @/* copy data */ + + SUBS Count,numTaps,#4 + BLT firEnd +firCopyLoop: + VST1 {dInp_0,dInp_1},[pStateCurnt]! + SUBS Count,#4 + VLD1 {dInp_0,dInp_1},[pState]! + BGE firCopyLoop + +firEnd: + + VLD1 {dTemp_0,dTemp_1},[pStateCurnt] + VBSL qMask,qInp,qTemp + VST1 {dOut_0,dOut_1},[pStateCurnt] + ADD pStateCurnt,pStateCurnt,mask, LSL #2 + + @/*Return From Function*/ + POP {r4-r12,pc} +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq blockSize + +.unreq pState +.unreq pCoeffs +.unreq pStateCurnt + +.unreq pX +.unreq pB +.unreq numTaps + +.unreq tapCnt +.unreq Count +.unreq pTemp +.unreq pMask + +.unreq mask + +@/*NEON variale Declaration*/ +.unreq qInp +.unreq dInp_0 +.unreq dInp_1 +.unreq qCoeff +.unreq dCoeff_0 +.unreq dCoeff_1 +.unreq qZero + +.unreq qMask +.unreq dMask_0 +.unreq dMask_1 +.unreq dOut_0 +.unreq dOut_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qTemp1 +.unreq dTemp1_0 +.unreq dTemp1_1 +.unreq qTemp2 +.unreq qTemp3 +.unreq qMask1 +.unreq dMask1_0 +.unreq dMask1_1 +.unreq qMaskTmp +.unreq dMaskTmp_0 +.unreq dMaskTmp_1 + + @/** + @ * @details + @ * This function operates on floating-point data types. + @ * There are no restrictions on numTaps and blockSize. + @ * + @ * The order of the coefficients in *coeffs should be + @ * bN, bN-1, bN-2, .....b1, b0 + @ * + @ * Cycle Count: + @ * + @ * Co + C1 * numTaps + C3 * blockSize * decimation Factor + c4 * numTaps * blockSize + @ * + @ * @param[in] *S points to struct parameter + @ * @param[in] *pSrc points to the input buffer + @ * @param[out] *pDst points to the output buffer + @ * @param[in] blockSize block size of filter + @ */ + + .align 4 + .global ne10_fir_decimate_float_neon + .extern ne10_qMaskTable32 + .extern ne10_divLookUpTable + .thumb + .thumb_func + +ne10_fir_decimate_float_neon: + + PUSH {r4-r12,lr} + +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +blockSize .req R3 + +pState .req R4 @/* State pointer */ +pCoeffs .req R5 @/* Coefficient pointer */ +decimationFact .req R6 +outBlockSize .req R7 + +pX .req R6 @/* Temporary pointers for state buffer */ +pB .req R8 @/* Temporary pointers for coefficient buffer */ +numTaps .req R9 @/* Length of the filter */ + +tapCnt .req R10 @ /* Loop counter */ +Count .req R11 @ /* Loop counter */ +pTemp .req R11 +blkCnt .req R11 +pMask .req R14 @ /* Mask Table */ + +mask .req R12 +Offset .req R12 + +@/*NEON variale Declaration*/ +qInp0 .qn Q0.F32 +dInp0_0 .dn D0.F32 +dInp0_1 .dn D1.F32 + +qCoeff .qn Q1.F32 +dCoeff_0 .dn D2.F32 +dCoeff_1 .dn D3.F32 + +qZero .qn Q2.F32 +qMask .qn Q3.U32 +qMaskF32 .qn Q3.F32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 +dOut_0 .dn D6.F32 +dOut_1 .dn D7.F32 + +qInp3 .qn Q4.F32 +dInp3_0 .dn D8.F32 +dInp3_1 .dn D9.F32 + +qAcc0 .qn Q8.F32 +dAcc0_0 .dn D16.F32 +dAcc0_1 .dn D17.F32 + + +qTemp .qn Q9.F32 +dTemp_0 .dn D18.F32 +dTemp_1 .dn D19.F32 + +qInp1 .qn Q9.F32 +dInp1_0 .dn D18.F32 +dInp1_1 .dn D19.F32 + +qAcc1 .qn Q10.F32 +dAcc1_0 .dn D20.F32 +dAcc1_1 .dn D21.F32 +qAcc2 .qn Q11.F32 +dAcc2_0 .dn D22.F32 +dAcc2_1 .dn D23.F32 +qAcc3 .qn Q12.F32 +dAcc3_0 .dn D24.F32 +dAcc3_1 .dn D25.F32 + +qMask1 .qn Q13.U32 +dMask1_0 .dn D26.U32 +dMask1_1 .dn D27.U32 + +qMaskTmp .qn Q14.U32 +dMaskTmp_0 .dn D28.U32 +dMaskTmp_1 .dn D29.U32 + + +qInp2 .qn Q15.F32 +dInp2_0 .dn D30.F32 +dInp2_1 .dn D31.F32 + + + + + LDRB decimationFact,[pStateStruct],#2 + LDRH numTaps,[pStateStruct],#2 + LDR pCoeffs,[pStateStruct],#4 + LDR pState,[pStateStruct],#4 + + @//outBlockSize = blockSize / S->M + LDR pTemp,=ne10_divLookUpTable + SUBS mask,decimationFact,#1 + ADD pTemp,pTemp,mask,LSL #2 + LDR mask,[pTemp] + @//MOV pX,#0 + + + SMULWB outBlockSize,blockSize,mask + CMP outBlockSize,#0 + IT LT + RSBLT outBlockSize,#0 + + + @/* S->state buffer contains previous frame (numTaps - 1) samples */ + @/* pStateCurnt points to the location where the new input data should be written */ + @//pStateCurnt = S->state + (numTaps - 1u)@ + + + @/* Copy Blocksize number of new input samples into the state buffer */ + + LDR pMask,=ne10_qMaskTable32 + SUB tapCnt,numTaps,#1 + AND mask,blockSize,#3 + + ADD pB,pState,tapCnt,LSL #2 + ADD mask,pMask,mask,LSL #4 + VLD1 {dTemp_0,dTemp_1},[pSrc]! + VLD1 {dMask1_0,dMask1_1},[mask] + + + SUBS Count,blockSize,#4 + LSL Offset,decimationFact, #2 + VMOV qMask,qMask1 + + BLT firDecimateEndCopy +firDecimateCopyLoop: + + VST1 {dTemp_0,dTemp_1},[pB]! + SUBS Count,#4 + VLD1 {dTemp_0,dTemp_1},[pSrc]! + BGE firDecimateCopyLoop +firDecimateEndCopy: + VLD1 {dCoeff_0,dCoeff_1},[pB] + + VBSL qMask,qTemp,qCoeff + VST1 {dMask_0,dMask_1},[pB] + ADD pB,pB,tapCnt,LSL #2 + + @// Load Mask Value + AND blkCnt,outBlockSize,#3 + ADD blkCnt,pMask,blkCnt,LSL #4 + VLD1 {dMask1_0,dMask1_1},[blkCnt] + + @/*Load Mask Table Values*/ + + AND tapCnt,numTaps,#3 + ADD pTemp,pMask,tapCnt,LSL #4 + VEOR qZero,qZero,qZero + VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp] + + @/*Handle 4 output samples at a time */ + SUBS blkCnt,outBlockSize,#4 + BLT firDecimateEndOuterLoop + + @//blkCnt = outBlockSize>>2@ +firDecimateOuterLoop: + @/* Set accumulator to zero */ + VEOR qAcc0,qAcc0,qAcc0 + VEOR qAcc1,qAcc1,qAcc1 + VEOR qAcc2,qAcc2,qAcc2 + VEOR qAcc3,qAcc3,qAcc3 + @/* Initialize state pointer */ + MOV pX,pState + @/* Initialize coeff pointer */ + MOV pB,pCoeffs + + SUBS tapCnt,numTaps,#4 + VLD1 {dCoeff_0,dCoeff_1},[pB]! + + VLD1 {dInp0_0,dInp0_1},[pX],Offset + VLD1 {dInp1_0,dInp1_1},[pX],Offset + VLD1 {dInp2_0,dInp2_1},[pX],Offset + VLD1 {dInp3_0,dInp3_1},[pX],Offset + SUB pX,pX,Offset, LSL #2 + ADD pX,pX,#16 + + BLT firDecimateEndInnerLoop +firDecimateInnerLoop: + VMLA qAcc0,qCoeff,qInp0 + VMLA qAcc1,qCoeff,qInp1 + VMLA qAcc2,qCoeff,qInp2 + VMLA qAcc3,qCoeff,qInp3 + + VLD1 {dCoeff_0,dCoeff_1},[pB]! + VLD1 {dInp0_0,dInp0_1},[pX],Offset + VLD1 {dInp1_0,dInp1_1},[pX],Offset + VLD1 {dInp2_0,dInp2_1},[pX],Offset + VLD1 {dInp3_0,dInp3_1},[pX],Offset + SUB pX,pX,Offset, LSL #2 + ADD pX,pX,#16 + + SUBS tapCnt,#4 + BGE firDecimateInnerLoop +firDecimateEndInnerLoop: + @/* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + VMOV qMask,qMaskTmp + VBSL qMask,qCoeff,qZero + + VMLA qAcc0,qMaskF32,qInp0 + VMLA qAcc1,qMaskF32,qInp1 + VMLA qAcc2,qMaskF32,qInp2 + VMLA qAcc3,qMaskF32,qInp3 + + VADD dAcc0_0,dAcc0_0,dAcc0_1 + VADD dAcc1_0,dAcc1_0,dAcc1_1 + VADD dAcc2_0,dAcc2_0,dAcc2_1 + VADD dAcc3_0,dAcc3_0,dAcc3_1 + + VPADD dAcc0_0,dAcc0_0,dAcc1_0 + VPADD dAcc0_1,dAcc2_0,dAcc3_0 + ADD pState,pState,Offset,LSL #2 + VST1 {dAcc0_0,dAcc0_1},[pDst]! + + SUBS blkCnt,#4 + BGE firDecimateOuterLoop + +firDecimateEndOuterLoop: + @/*Handle BlockSize Not a Multiple of 4*/ + ADDS blkCnt,#4 + BEQ firDecimateCopyData + + + @/* Set accumulator to zero */ + VEOR qAcc0,qAcc0,qAcc0 + VEOR qAcc1,qAcc1,qAcc1 + VEOR qAcc2,qAcc2,qAcc2 + VEOR qAcc3,qAcc3,qAcc3 + @/* Initialize state pointer */ + MOV pX,pState + @/* Initialize coeff pointer */ + MOV pB,pCoeffs + SUBS tapCnt,numTaps,#4 + VLD1 {dCoeff_0,dCoeff_1},[pB]! + + VLD1 {dInp0_0,dInp0_1},[pX],Offset + VLD1 {dInp1_0,dInp1_1},[pX],Offset + VLD1 {dInp2_0,dInp2_1},[pX],Offset + VLD1 {dInp3_0,dInp3_1},[pX],Offset + SUB pX,pX,Offset, LSL #2 + ADD pX,pX,#16 + + BLT firDecimateEndInnerLoop1 +firDecimateInnerLoop1: + VMLA qAcc0,qCoeff,qInp0 + VMLA qAcc1,qCoeff,qInp1 + VMLA qAcc2,qCoeff,qInp2 + VMLA qAcc3,qCoeff,qInp3 + + VLD1 {dCoeff_0,dCoeff_1},[pB]! + VLD1 {dInp0_0,dInp0_1},[pX],Offset + VLD1 {dInp1_0,dInp1_1},[pX],Offset + VLD1 {dInp2_0,dInp2_1},[pX],Offset + VLD1 {dInp3_0,dInp3_1},[pX],Offset + SUB pX,pX,Offset, LSL #2 + ADD pX,pX,#16 + + SUBS tapCnt,#4 + BGE firDecimateInnerLoop1 +firDecimateEndInnerLoop1: + @/* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + VMOV qMask,qMaskTmp + VBSL qMask,qCoeff,qZero + + VMLA qAcc0,qMaskF32,qInp0 + VMLA qAcc1,qMaskF32,qInp1 + VMLA qAcc2,qMaskF32,qInp2 + VMLA qAcc3,qMaskF32,qInp3 + + VADD dAcc0_0,dAcc0_0,dAcc0_1 + VADD dAcc1_0,dAcc1_0,dAcc1_1 + VADD dAcc2_0,dAcc2_0,dAcc2_1 + VADD dAcc3_0,dAcc3_0,dAcc3_1 + + + MUL Offset,Offset,blkCnt + VPADD dAcc0_0,dAcc0_0,dAcc1_0 + VPADD dAcc0_1,dAcc2_0,dAcc3_0 + ADD pState,pState,Offset + + VMOV qMask,qMask1 + VLD1 {dTemp_0,dTemp_1},[pDst] + VBSL qMask,qAcc0,qTemp + + VST1 {dMask_0,dMask_1},[pDst] + ADD pDst,pDst,blkCnt,LSL #2 + + +firDecimateCopyData: + @/* Processing is complete. Now shift the data in the state buffer down by + @** blockSize samples. This prepares the state buffer for the next function + @** call. */ + + @/* Points to the start of the state buffer */ + + SUB numTaps,numTaps,#1 + AND mask,numTaps,#3 + LDR pX,[pStateStruct,#-4] + ADD pTemp,pMask,mask,LSL #4 + VLD1 {dInp0_0,dInp0_1},[pState]! + VLD1 {dMask_0,dMask_1},[pTemp] + + @/* copy data */ + + SUBS Count,numTaps,#4 + BLT firDecimateEnd +firDecimateCopyLoop1: + VST1 {dInp0_0,dInp0_1},[pX]! + SUBS Count,#4 + VLD1 {dInp0_0,dInp0_1},[pState]! + BGE firDecimateCopyLoop1 +firDecimateEnd: + VLD1 {dTemp_0,dTemp_1},[pX] + VBSL qMask,qInp0,qTemp + VST1 {dOut_0,dOut_1},[pX] + ADD pX,pX,mask, LSL #2 + + @// Return From Function + POP {r4-r12,pc} + +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq blockSize + +.unreq pState +.unreq pCoeffs +.unreq decimationFact +.unreq outBlockSize + +.unreq pX +.unreq pB +.unreq numTaps + +.unreq tapCnt +.unreq Count +.unreq pTemp +.unreq blkCnt +.unreq pMask + +.unreq mask +.unreq Offset + +@/*NEON variale Declaration*/ +.unreq qInp0 +.unreq dInp0_0 +.unreq dInp0_1 + +.unreq qCoeff +.unreq dCoeff_0 +.unreq dCoeff_1 + +.unreq qZero +.unreq qMask +.unreq qMaskF32 +.unreq dMask_0 +.unreq dMask_1 +.unreq dOut_0 +.unreq dOut_1 + +.unreq qInp3 +.unreq dInp3_0 +.unreq dInp3_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qInp1 +.unreq dInp1_0 +.unreq dInp1_1 + +.unreq qAcc1 +.unreq dAcc1_0 +.unreq dAcc1_1 +.unreq qAcc2 +.unreq dAcc2_0 +.unreq dAcc2_1 +.unreq qAcc3 +.unreq dAcc3_0 +.unreq dAcc3_1 + +.unreq qMask1 +.unreq dMask1_0 +.unreq dMask1_1 + +.unreq qMaskTmp +.unreq dMaskTmp_0 +.unreq dMaskTmp_1 + +.unreq qInp2 +.unreq dInp2_0 +.unreq dInp2_1 + + + @/** + @ * @details + @ * This function operates on floating-point data types. + @ * There are no restrictions on numTaps and blockSize. + @ * + @ * The order of the coefficients in *coeffs should be + @ * bN, bN-1, bN-2, .....b1, b0 + @ * + @ * Cycle Count: + @ * + @ * C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor + @ * + @ * @param[in] *S points to struct parameter + @ * @param[in] *pSrc points to the input buffer + @ * @param[out] *pDst points to the output buffer + @ * @param[in] blockSize block size of filter + @ */ + + .align 4 + .global ne10_fir_interpolate_float_neon + .extern ne10_qMaskTable32 + .thumb + .thumb_func + +ne10_fir_interpolate_float_neon: + PUSH {r4-r12,lr} + + +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +blockSize .req R3 + +pState .req R4 @/* State pointer */ + +pB .req R5 @/* Temporary pointers for coefficient buffer */ +pCoeffs .req R5 @/* Coefficient pointer */ +pStateCurnt .req R5 @/* Points to the current sample of the state */ + +pX .req R6 @/* Temporary pointers for state buffer */ + +interpolationFact .req R7 +intFact .req R8 +phaseLen .req R9 +Offset .req R10 + +Count .req R11 @ /* Loop counter */ +pTemp .req R11 + +mask .req R12 + +pMask .req R14 @ /* Mask Table */ +index .req R14 + +@/*NEON variale Declaration*/ +qInp .qn Q0.F32 +dInp_0 .dn D0.F32 +dInp_1 .dn D1.F32 +qCoeff0 .qn Q1.F32 +dCoeff0_0 .dn D2.F32 +dCoeff0_1 .dn D3.F32 +qZero .qn Q2.F32 + +qMask .qn Q3.U32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 +dOut_0 .dn D6.F32 +dOut_1 .dn D7.F32 + +qAcc0 .qn Q8.F32 +dAcc0_0 .dn D16.F32 +dAcc0_1 .dn D17.F32 + + +qTemp .qn Q9.F32 +dTemp_0 .dn D18.F32 +dTemp_1 .dn D19.F32 + +qCoeff1 .qn Q10.F32 +dCoeff1_0 .dn D20.F32 +dCoeff1_1 .dn D21.F32 +qCoeff2 .qn Q11.F32 +dCoeff2_0 .dn D22.F32 +dCoeff2_1 .dn D23.F32 +qCoeff3 .qn Q12.F32 +dCoeff3_0 .dn D24.F32 +dCoeff3_1 .dn D25.F32 + +qMask1 .qn Q13.F32 +dMask1_0 .dn D26.F32 +dMask1_1 .dn D27.F32 + + +qMaskTemp .qn Q14.U32 +dMaskTemp_0 .dn D28.U32 +dMaskTemp_1 .dn D29.U32 + + LDRB interpolationFact,[pStateStruct],#2 + LDRH phaseLen,[pStateStruct],#2 + LDR pCoeffs,[pStateStruct],#4 + LDR pState,[pStateStruct],#4 + + LSL Offset,interpolationFact, #2 + + + + @/* S->state buffer contains previous frame (phaseLen - 1) samples */ + @/* pStateCurnt points to the location where the new input data should be written */ + + + AND phaseLen,#3 + LDR pMask,=ne10_qMaskTable32 + + @/* Total number of intput samples */ + @/*Load Mask Value*/ + + AND mask,interpolationFact,#3 + ADD pTemp,pMask,phaseLen,LSL #4 + ADD mask,pMask,mask,LSL #4 + + VLD1 {dMaskTemp_0,dMaskTemp_1},[pTemp] + VLD1 {dMask1_0,dMask1_1},[mask] + + + VEOR qZero,qZero,qZero + + + + @/* Loop over the blockSize. */ + CMP blockSize,#0 + BEQ firInterpolateCopyData +firInterpolateBlockLoop: + @/* Copy new input sample into the state buffer */ + LDRH phaseLen,[pStateStruct,#-10] + LDR mask,[pSrc],#4 + SUB phaseLen,#1 + ADD pStateCurnt,pState,phaseLen, LSL #2 + + LDRB interpolationFact,[pStateStruct,#-12] + STR mask,[pStateCurnt] + + + SUBS intFact,interpolationFact,#4 + MOV index,#4 + + BLT firInterpolateEndIntplLoop +firInterpolateInterpolLoop: + VEOR qAcc0,qAcc0,qAcc0 + LDRH phaseLen,[pStateStruct,#-10] + LDR pCoeffs,[pStateStruct,#-8] + MOV pX,pState + SUB mask,interpolationFact,index + ADD pB,pCoeffs,mask, LSL #2 + @/*Load Coefficients*/ + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset + VLD1 {dInp_0,dInp_1},[pX]! + @/* Loop over the polyPhase length. Unroll by a factor of 4. + @ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + SUBS phaseLen,#4 + BLT firInterpolateEndPhaseLoop +firInterpolatePhaseLoop: + @/* Perform the multiply-accumulate */ + VMLA qAcc0,qCoeff0,dInp_0[0] + VMLA qAcc0,qCoeff1,dInp_0[1] + VMLA qAcc0,qCoeff2,dInp_1[0] + VMLA qAcc0,qCoeff3,dInp_1[1] + + VLD1 {dInp_0,dInp_1},[pX]! + @ /*Load Coefficients*/ + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset + + SUBS phaseLen,#4 + BGE firInterpolatePhaseLoop +firInterpolateEndPhaseLoop: + @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + VMOV qMask,qMaskTemp + VBSL qMask,qInp,qZero + @/* Perform the multiply-accumulate */ + VMLA qAcc0,qCoeff0,dOut_0[0] + VMLA qAcc0,qCoeff1,dOut_0[1] + VMLA qAcc0,qCoeff2,dOut_1[0] + VMLA qAcc0,qCoeff3,dOut_1[1] + @ /* The result is in the accumulator is in Reverse Order*/ + VREV64 qAcc0,qAcc0 + @/*Swap the D-Regs of Acc*/ + VMOV dCoeff0_0,dAcc0_1 + VMOV dCoeff0_1,dAcc0_0 + + VST1 {dCoeff0_0,dCoeff0_1},[pDst]! + ADD index,#4 + SUBS intFact,#4 + BGE firInterpolateInterpolLoop + +firInterpolateEndIntplLoop: + ADDS intFact,#4 + BEQ firInterpolateNextSample + @/*Handle the Remaining Samples*/ + VEOR qAcc0,qAcc0,qAcc0 + LDRH phaseLen,[pStateStruct,#-10] + LDR pCoeffs,[pStateStruct,#-8] + MOV pX,pState + SUB mask,interpolationFact,index + ADD pB,pCoeffs,mask, LSL #2 + @/*Load Coefficients*/ + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset + VLD1 {dInp_0,dInp_1},[pX]! + @/* Loop over the polyPhase length. Unroll by a factor of 4. + @ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */ + SUBS phaseLen,#4 + BLT firInterpolateEndPhaseLoop1 +firInterpolatePhaseLoop1: + @/* Perform the multiply-accumulate */ + VMLA qAcc0,qCoeff0,dInp_0[0] + VMLA qAcc0,qCoeff1,dInp_0[1] + VMLA qAcc0,qCoeff2,dInp_1[0] + VMLA qAcc0,qCoeff3,dInp_1[1] + + VLD1 {dInp_0,dInp_1},[pX]! + @ /*Load Coefficients*/ + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset + @/*c0 c1 c2 c3*/ + VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset + SUBS phaseLen,#4 + BGE firInterpolatePhaseLoop1 + +firInterpolateEndPhaseLoop1: + @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ + VMOV qMask,qMaskTemp + VBSL qMask,qInp,qZero + @/* Perform the multiply-accumulate */ + VMLA qAcc0,qCoeff0,dOut_0[0] + VMLA qAcc0,qCoeff1,dOut_0[1] + VMLA qAcc0,qCoeff2,dOut_1[0] + VMLA qAcc0,qCoeff3,dOut_1[1] + @ /* The result is in the accumulator is in Reverse Order*/ + VREV64 qAcc0,qAcc0 + + VMOV qMask,qMask1 + VLD1 {dTemp_0,dTemp_1},[pDst] + @/*Swap the D-Regs of Acc*/ + VMOV dCoeff0_0,dAcc0_1 + VMOV dCoeff0_1,dAcc0_0 + + VBSL qMask,qCoeff0,qTemp + VST1 {dMask_0,dMask_1},[pDst] + ADD pDst,pDst,intFact, LSL #2 + + +firInterpolateNextSample: + SUBS blockSize,#1 + ADD pState,#4 + BGT firInterpolateBlockLoop + + @/*End of Processing*/ + +firInterpolateCopyData: + + @/* Save previous phaseLen - 1 samples and get rid of other samples */ + @/* Points to the start of the state buffer */ + LDRH phaseLen,[pStateStruct,#-10] + LDR pMask,=ne10_qMaskTable32 + LDR pStateCurnt,[pStateStruct,#-4] + + SUB phaseLen,phaseLen,#1 + AND mask,phaseLen,#3 + ADD pTemp,pMask,mask,LSL #4 + + VLD1 {dInp_0,dInp_1},[pState]! + VLD1 {dMask_0,dMask_1},[pTemp] + + @/* copy data */ + + SUBS Count,phaseLen,#4 + BLT firInterpolateEnd +firInterpolateCopyLoop: + VST1 {dInp_0,dInp_1},[pStateCurnt]! + SUBS Count,#4 + VLD1 {dInp_0,dInp_1},[pState]! + BGE firInterpolateCopyLoop +firInterpolateEnd: + + VLD1 {dTemp_0,dTemp_1},[pStateCurnt] + VBSL qMask,qInp,qTemp + VST1 {dOut_0,dOut_1},[pStateCurnt] + + ADD pStateCurnt,pStateCurnt,mask, LSL #2 + + @/*Return From Function*/ + POP {r4-r12,pc} +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq blockSize + +.unreq pState + +.unreq pB +.unreq pCoeffs +.unreq pStateCurnt + +.unreq pX + +.unreq interpolationFact +.unreq intFact +.unreq phaseLen +.unreq Offset + +.unreq Count +.unreq pTemp + +.unreq mask + +.unreq pMask +.unreq index + +@/*NEON variale Declaration*/ +.unreq qInp +.unreq dInp_0 +.unreq dInp_1 +.unreq qCoeff0 +.unreq dCoeff0_0 +.unreq dCoeff0_1 +.unreq qZero + +.unreq qMask +.unreq dMask_0 +.unreq dMask_1 +.unreq dOut_0 +.unreq dOut_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qCoeff1 +.unreq dCoeff1_0 +.unreq dCoeff1_1 +.unreq qCoeff2 +.unreq dCoeff2_0 +.unreq dCoeff2_1 +.unreq qCoeff3 +.unreq dCoeff3_0 +.unreq dCoeff3_1 + +.unreq qMask1 +.unreq dMask1_0 +.unreq dMask1_1 + +.unreq qMaskTemp +.unreq dMaskTemp_0 +.unreq dMaskTemp_1 + + + @/** + @ * @details + @ * This function operates on floating-point data types. + @ * There are no restrictions on numStages and blockSize. + @ * + @ * The order of the coefficients in *coeffs should be + @ * k1, k2, ...kM-1 + @ * + @ * Cycle Count: + @ * + @ * c0 + c1 * blockSize + c2 * numStages * blockSize + @ * + @ * @param[in] *S points to struct parameter + @ * @param[in] *pSrc points to the input buffer + @ * @param[out] *pDst points to the output buffer + @ * @param[in] blockSize block size of filter + @ */ + + .align 4 + .global ne10_fir_lattice_float_neon + .extern ne10_qMaskTable32 + .thumb + .thumb_func + +ne10_fir_lattice_float_neon: + + PUSH {r4-r12,lr} + +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +blockSize .req R3 + +pState .req R4 @/* State pointer */ +pCoeffs .req R5 @/* Coefficient pointer */ + +pX .req R7 @/* Temporary pointers for state buffer */ +pB .req R8 @/* Temporary pointers for coefficient buffer */ +numStages .req R9 @/* Length of the filter */ + +stageCnt .req R10 @ /* Loop counter */ + + +pTemp .req R11 +pMask .req R14 @ /* Mask Table */ +mask .req R12 + +@/*NEON variale Declaration*/ +qFcurr .qn Q0.F32 +dFcurr_0 .dn D0.F32 +dFcurr_1 .dn D1.F32 +qCoeff .qn Q1.F32 +dCoeff_0 .dn D2.F32 +dCoeff_1 .dn D3.F32 + +qZero .qn Q2.F32 + +qMask .qn Q3.U32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 +dOut_0 .dn D6.F32 +dOut_1 .dn D7.F32 + +qAcc0 .qn Q8.F32 +dAcc0_0 .dn D16.F32 +dAcc0_1 .dn D17.F32 + +qTemp .qn Q9.F32 +dTemp_0 .dn D18.F32 +dTemp_1 .dn D19.F32 + +qFnext .qn Q10.F32 +dFnext_0 .dn D20.F32 +dFnext_1 .dn D21.F32 +qGcurr .qn Q11.F32 +dGcurr_0 .dn D22.F32 +dGcurr_1 .dn D23.F32 +qGnext .qn Q12.F32 +dGnext_0 .dn D24.F32 +dGnext_1 .dn D25.F32 + +qMask1 .qn Q13.U32 +dMask1_0 .dn D26.U32 +dMask1_1 .dn D27.U32 +qMaskTmp .qn Q14.U32 +dMaskTmp_0 .dn D28.U32 +dMaskTmp_1 .dn D29.U32 +qTemp1 .qn Q15.F32 +dTemp1_0 .dn D30.F32 +dTemp1_1 .dn D31.F32 + +fNext .dn D0.F32 +gCurr .dn D1.F32 +gNext .dn D2.F32 +fCurr .dn D3.F32 +Coeff .dn D4.F32 + + @/* Length of the filter */ + LDRH numStages,[pStateStruct],#4 + @/* State pointer */ + LDR pState,[pStateStruct],#4 + @/* Coefficient pointer */ + LDR pCoeffs,[pStateStruct],#4 + + + @// Get the Mask Values + + LDR pMask,=ne10_qMaskTable32 + SUB numStages,#1 + AND mask,numStages, #3 + AND stageCnt,blockSize,#3 + + ADD pTemp,pMask,mask,LSL #4 + ADD stageCnt,pMask,stageCnt,LSL #4 + VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp] + VLD1 {dMask1_0,dMask1_1},[stageCnt] + VEOR qZero,qZero,qZero + + + SUBS blockSize,#4 + BLT firLatticeEndOuterLoop +firLatticeOuterLoop: + @/* Initialize coeff pointer */ + MOV pB,pCoeffs + @/* Initialize state pointer */ + MOV pX,pState + @/* Read Four samples from input buffer: fcurr0, fcurr1,fcurr2,fcurr3*/ + @/* f0(n) = x(n) */ + VLD1 {dFcurr_0,dFcurr_1},[pSrc]! + @/*Read one Sample from the State Buffer*/ + VLD1 {dGcurr_1[1]},[pX] + VEXT qGnext,qGcurr,qFcurr,#3 + + VLD1 {dCoeff_0[],dCoeff_1[]},[pB]! + VMOV qFnext,qFcurr + VST1 {dFcurr_1[1]},[pX]! + @/* fi(n) = fi-1(n) + Ki * gi-1(n-1) */ + @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */ + @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/ + + VMLA qFcurr,qGnext,qCoeff + @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/ + VMLA qGnext,qFnext,qCoeff + + + @/* Loop unrolling. Process 4 taps at a time . */ + SUBS stageCnt,numStages,#4 + BLT firLatticeEndInnerLoop + @/* Loop over the number of taps. Unroll by a factor of 4. + @ * Repeat until we've computed numStages-3 coefficients. */ + @/* Process 2nd, 3rd, 4th and 5th taps ... here */ +firLatticeInnerLoop: + VLD1 {dGcurr_1[1]},[pX]! + VREV64 dTemp_0,dGnext_1 + + VLD1 {dCoeff_0[],dCoeff_1[]},[pB]! + VEXT qGcurr,qGcurr,qGnext,#3 + + @ /* fi(n) = fi-1(n) + Ki * gi-1(n-1) */ + @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */ + @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/ + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/ + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + @/*Prepare for Next Stage*/ + VLD1 {dGcurr_1[1]},[pX]! + + VLD1 {dCoeff_0[],dCoeff_1[]},[pB]! + VEXT dTemp_0,dGnext_1,dTemp_0,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + @/*Next Stage*/ + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + @/*Prepare for Next Stage*/ + VLD1 {dGcurr_1[1]},[pX]! + VLD1 {dCoeff_0[],dCoeff_1[]},[pB]! + VEXT dTemp_1,dGnext_1,dTemp_1,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + + @/*Next Stage*/ + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + + @/*Prepare for Next Stage*/ + VLD1 {dGcurr_1[1]},[pX]! + VLD1 {dCoeff_0[],dCoeff_1[]},[pB]! + VEXT dTemp_1,dGnext_1,dTemp_1,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + VREV64 qTemp,qTemp + @/*Next Stage*/ + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qFcurr,qGcurr,qCoeff + VMLA qGnext,qFnext,qCoeff + SUB pX,#16 + + @/*Store the samples in the state buffer for next frame*/ + VST1 {dTemp_0,dTemp_1},[pX]! + SUBS stageCnt,#4 + BGE firLatticeInnerLoop +firLatticeEndInnerLoop: + ADDS stageCnt,#4 + BEQ firLatticeFinishInner + VMOV qMask,qMaskTmp + VLD1 {dCoeff_0,dCoeff_1},[pB]! + + VLD1 {dGcurr_1[1]},[pX]! + VREV64 dTemp_0,dGnext_1 + VBSL qMask,qCoeff,qZero + + + VEXT qGcurr,qGcurr,qGnext,#3 + VDUP qCoeff,dMask_0[0] + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + VLD1 {dGcurr_1[1]},[pX]! + + VDUP qCoeff,dMask_0[1] + VEXT dTemp_0,dGnext_1,dTemp_0,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + VLD1 {dGcurr_1[1]},[pX]! + VDUP qCoeff,dMask_1[0] + VEXT dTemp_1,dGnext_1,dTemp_1,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff + VMLA qFcurr,qGcurr,qCoeff + + VLD1 {dGcurr_1[1]},[pX]! + VDUP qCoeff,dMask_1[1] + VEXT dTemp_1,dGnext_1,dTemp_1,#1 + VEXT qGcurr,qGcurr,qGnext,#3 + + VREV64 qTemp,qTemp + + VMOV qFnext,qFcurr + VMOV qGnext,qGcurr + SUB pX,pX,#16 + + VMOV qMask,qMaskTmp + VMLA qFcurr,qGcurr,qCoeff + VLD1 {dTemp1_0,dTemp1_1},[pX] + VMLA qGnext,qFnext,qCoeff + VBSL qMask,qTemp,qTemp1 + VST1 {dMask_0,dMask_1},[pX] + ADD pX,pX,stageCnt, LSL #2 + +firLatticeFinishInner: + + VST1 {dFcurr_0,dFcurr_1},[pDst]! + SUBS blockSize,#4 + BGE firLatticeOuterLoop + +firLatticeEndOuterLoop: + ADDS blockSize,#4 + BEQ firLatticeEnd + +firLatticeOuterLoop1: + VLD1 {fCurr[0]},[pSrc]! + MOV pB,pCoeffs + MOV pX,pState + VLD1 {gCurr[0]},[pX] + VLD1 {Coeff[0]},[pB]! + + VST1 {fCurr[0]},[pX]! + VMOV gNext,gCurr + VMLA gNext,Coeff,fCurr + VMLA fCurr,Coeff,gCurr + + SUBS stageCnt,numStages,#1 + BLE firLatticeEndinnerLoop1 +firLatticeInnerLoop1: + VLD1 {gCurr[0]},[pX] + VST1 {gNext[0]},[pX]! + + VLD1 {Coeff[0]},[pB]! + + VMOV gNext,gCurr + VMLA gNext,Coeff,fCurr + VMLA fCurr,Coeff,gCurr + SUBS stageCnt,#1 + BGE firLatticeInnerLoop1 +firLatticeEndinnerLoop1: + VST1 {fCurr[0]},[pDst]! + SUBS blockSize,#1 + BGT firLatticeOuterLoop1 + +firLatticeEnd: + @/*Return From Function*/ + POP {r4-r12,pc} + +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq blockSize + +.unreq pState +.unreq pCoeffs + +.unreq pX +.unreq pB +.unreq numStages + +.unreq stageCnt + +.unreq pTemp +.unreq pMask +.unreq mask + +.unreq fNext +.unreq gCurr +.unreq gNext +.unreq fCurr +.unreq Coeff + +@/*NEON variale Declaration*/ +.unreq qFcurr +.unreq dFcurr_0 +.unreq dFcurr_1 +.unreq qCoeff +.unreq dCoeff_0 +.unreq dCoeff_1 + +.unreq qZero + +.unreq qMask +.unreq dMask_0 +.unreq dMask_1 +.unreq dOut_0 +.unreq dOut_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qFnext +.unreq dFnext_0 +.unreq dFnext_1 +.unreq qGcurr +.unreq dGcurr_0 +.unreq dGcurr_1 +.unreq qGnext +.unreq dGnext_0 +.unreq dGnext_1 + +.unreq qMask1 +.unreq dMask1_0 +.unreq dMask1_1 +.unreq qMaskTmp +.unreq dMaskTmp_0 +.unreq dMaskTmp_1 +.unreq qTemp1 +.unreq dTemp1_0 +.unreq dTemp1_1 + + @/** + @ * @details + @ * This function operates on floating-point data types. + @ * There are no restrictions on numTaps and blockSize. + @ * + @ * The scratch buffer, pScratch is internally used for holding the state values temporarily. + @ * Cycle Count: + @ * + @ * C0 * blockSize + C1 * numTaps + C2 * numTaps * blockSize + @ * + @ * Cycle Count: + @ * + @ * C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor + @ * + @ * @param[in] *S points to struct parameter + @ * @param[in] *pSrc points to the input buffer + @ * @param[out] *pDst points to the output buffer + @ * @param[out] *pScratch points to the scratch buffer + @ * @param[in] blockSize block size of filter + @ */ + + .align 4 + .global ne10_fir_sparse_float_neon + .extern ne10_qMaskTable32 + .thumb + .thumb_func + +ne10_fir_sparse_float_neon: + PUSH {r4-r12,lr} + PUSH {r0} + +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +pScratch .req R3 +blockSize .req R4 +size2 .req R4 + +pYtmp1 .req R0 +pOut .req R0 +Offset .req R0 + +readIndex .req R1 + +numTaps .req R5 @/* Length of the filter */ + +pState .req R6 @/* State pointer */ +pCoeffs .req R7 @/* Coefficient pointer */ +stateIndex .req R8 + +maxDelay .req R9 +delaySize .req R9 + +pTapDelay .req R10 + +blkCnt .req R11 +size1 .req R11 +temp .req R1 +mask .req R11 +pMask .req R11 + +pX .req R12 + +pY .req R14 +pYtmp2 .req R14 + + +@/*NEON variale Declaration*/ +qInp .qn Q0.F32 +dInp_0 .dn D0.F32 +dInp_1 .dn D1.F32 + +qCoeff .qn Q1.F32 +dCoeff_0 .dn D2.F32 +dCoeff_1 .dn D3.F32 + +qZero .qn Q2.F32 + +qMask .qn Q3.U32 +qMaskF32 .qn Q3.F32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 + + +qAcc0 .qn Q4.F32 +dAcc0_0 .dn D8.F32 +dAcc0_1 .dn D9.F32 + + +qTemp .qn Q8.F32 +dTemp_0 .dn D16.F32 +dTemp_1 .dn D17.F32 + + +qMaskTmp .qn Q9.U32 +dMaskTmp_0 .dn D18.U32 +dMaskTmp_1 .dn D19.U32 + + + /*Load Mask Table*/ + + LDRH numTaps,[pStateStruct],#2 + LDRH stateIndex,[pStateStruct],#2 + LDR pState,[pStateStruct],#4 + LDR pCoeffs,[pStateStruct],#4 + LDRH maxDelay,[pStateStruct],#4 + LDR pTapDelay,[pStateStruct],#4 + + @// Load blockSize from Stack + LDR blockSize,[SP,#44] + LDR pMask,=ne10_qMaskTable32 + ADD delaySize,blockSize,maxDelay + + VEOR qZero,qZero + AND pY,blockSize,#3 + ADD pY,pMask,pY,LSL #4 + VLD1 {dMaskTmp_0,dMaskTmp_1},[pY] + + + @/* BlockSize of Input samples are copied into the state buffer */ + @/* StateIndex points to the starting position to write in the state buffer */ + MOV pX,pState + LSL Offset,stateIndex,#2 + + SUBS blkCnt,blockSize,#1 + BLT firSparseEndSrcCopy +firSparseSrcCopyLoop: + + LDR pY,[pSrc],#4 + STR pY,[pX,Offset] + ADD Offset,#4 + CMP delaySize,Offset,LSR #2 + IT LE + SUBLE Offset,Offset,delaySize, LSL #2 + SUBS blkCnt,#1 + BGE firSparseSrcCopyLoop +firSparseEndSrcCopy: + + LSR stateIndex,Offset,#2 + LDR Offset,[SP,#0] + STRH stateIndex,[Offset,#2] + + LDR readIndex,[pTapDelay],#4 + ADD readIndex,readIndex,blockSize + SUBS readIndex,stateIndex,readIndex + + @/*Wrap arround index*/ + IT LT + ADDLT readIndex,readIndex,delaySize + + + @/*Processing begins*/ + @/*First stage*/ + MOV pY,pState + MOV pX,pScratch + + @/* copy the sample from the circular buffer to the destination buffer */ + SUB size1,delaySize,readIndex + CMP size1,blockSize + IT GT + MOVGT size1,blockSize + + ADD pYtmp1,pY,readIndex, LSL #2 + SUB size2,blockSize,size1 + MOV pYtmp2,pY + + CMP size1,#0 + BLE firSparseEndcopy1 +firSparseCopy1: + LDR temp,[pYtmp1],#4 + SUBS size1,#1 + STR temp,[pScratch],#4 + BGT firSparseCopy1 +firSparseEndcopy1: + + CMP size2,#0 + BLE firSparseEndcopy2 +firSparseCopy2: + LDR temp,[pYtmp2],#4 + SUBS size2,#1 + STR temp,[pScratch],#4 + BGT firSparseCopy2 +firSparseEndcopy2: + + + @// Load blockSize from Stack + LDR blockSize,[SP,#44] + + + MOV pOut,pDst + VLD1 {dCoeff_0[],dCoeff_1[]},[pCoeffs]! + @//CMP tapCnt,numTaps + + @//Complete the case of tapCnt=numTaps + SUBS blkCnt,blockSize,#4 + VLD1 {dInp_0,dInp_1},[pX]! + BLT firSparseEndInnerLoop +firSparseInnerLoop: + VMUL qAcc0,qInp,qCoeff + VLD1 {dInp_0,dInp_1},[pX]! + SUBS blkCnt,#4 + VST1 {dAcc0_0,dAcc0_1},[pOut]! + BGE firSparseInnerLoop + +firSparseEndInnerLoop: + ADDS blkCnt,#4 + @/* If the blockSize is not a multiple of 4, + @* * compute the remaining samples */ + + VLD1 {dTemp_0,dTemp_1},[pOut] + VMUL qAcc0,qInp,qCoeff + VMOV qMask,qMaskTmp + VBSL qMask,qAcc0,qTemp + VST1 {dMask_0,dMask_1},[pOut] + ADD pOut,pOut,blkCnt,LSL #2 + + LDR readIndex,[pTapDelay],#4 + ADD readIndex,readIndex,blockSize + SUBS readIndex,stateIndex,readIndex + + @/*Wrap arround index*/ + IT LT + ADDLT readIndex,readIndex,delaySize + + SUBS numTaps,#1 + BLE firSparseEnd +firSparseOuterLoop: + + @// Load blockSize from Stack + LDR blockSize,[SP,#44] + + MOV pY,pState + MOV pX,pScratch + + @/* copy the sample from the circular buffer to the destination buffer */ + SUB size1,delaySize,readIndex + CMP size1,blockSize + IT GT + MOVGT size1,blockSize + + ADD pYtmp1,pY,readIndex, LSL #2 + SUB size2,blockSize,size1 + MOV pYtmp2,pY + + + CMP size1,#0 + BLE firSparseEndcopy3 +firSparseCopy3: + LDR temp,[pYtmp1],#4 + SUBS size1,#1 + STR temp,[pScratch],#4 + BGT firSparseCopy3 +firSparseEndcopy3: + CMP size2,#0 + BLE firSparseEndcopy4 +firSparseCopy4: + LDR temp,[pYtmp2],#4 + SUBS size2,#1 + STR temp,[pScratch],#4 + BGT firSparseCopy4 +firSparseEndcopy4: + + @// Load blockSize from Stack + LDR blockSize,[SP,#44] + + + MOV pOut,pDst + VLD1 {dCoeff_0[],dCoeff_1[]},[pCoeffs]! + + + @//Complete the case of tapCnt=numTaps + SUBS blkCnt,blockSize,#4 + VLD1 {dInp_0,dInp_1},[pX]! + VLD1 {dAcc0_0,dAcc0_1},[pOut] + BLT firSparseEndInnerLoop1 +firSparseInnerLoop1: + VMLA qAcc0,qInp,qCoeff + VLD1 {dInp_0,dInp_1},[pX]! + SUBS blkCnt,#4 + VST1 {dAcc0_0,dAcc0_1},[pOut]! + VLD1 {dAcc0_0,dAcc0_1},[pOut] + BGE firSparseInnerLoop1 + +firSparseEndInnerLoop1: + ADDS blkCnt,#4 + @/* If the blockSize is not a multiple of 4, + @* * compute the remaining samples */ + + + VMOV qMask,qMaskTmp + VBSL qMask,qInp,qZero + VMLA qAcc0,qMaskF32,qCoeff + + VST1 {dAcc0_0,dAcc0_1},[pOut] + ADD pOut,pOut,blkCnt,LSL #2 + + LDR readIndex,[pTapDelay],#4 + ADD readIndex,readIndex,blockSize + SUBS readIndex,stateIndex,readIndex + + @/*Wrap arround index*/ + IT LT + ADDLT readIndex,readIndex,delaySize + + SUBS numTaps,#1 + + BGT firSparseOuterLoop +firSparseEnd: + @// Return From Function + POP {r0} + POP {r4-r12,pc} + +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq pScratch +.unreq blockSize +.unreq size2 + +.unreq pYtmp1 +.unreq pOut +.unreq Offset + +.unreq readIndex + +.unreq numTaps + +.unreq pState +.unreq pCoeffs +.unreq stateIndex + +.unreq maxDelay +.unreq delaySize + +.unreq pTapDelay + +.unreq blkCnt +.unreq size1 +.unreq temp +.unreq mask +.unreq pMask + +.unreq pX + +.unreq pY +.unreq pYtmp2 + +@/*NEON variale Declaration*/ +.unreq qInp +.unreq dInp_0 +.unreq dInp_1 + +.unreq qCoeff +.unreq dCoeff_0 +.unreq dCoeff_1 + +.unreq qZero + +.unreq qMask +.unreq qMaskF32 +.unreq dMask_0 +.unreq dMask_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qMaskTmp +.unreq dMaskTmp_0 +.unreq dMaskTmp_1 + + .end diff --git a/modules/dsp/NE10_fir_init.c b/modules/dsp/NE10_fir_init.c new file mode 100644 index 0000000..de9bce4 --- /dev/null +++ b/modules/dsp/NE10_fir_init.c @@ -0,0 +1,276 @@ +/* + * Copyright 2012 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "NE10_types.h" + +/** + * @details + * + * @param[in,out] *S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32(). + */ + +ne10_result_t ne10_fir_init_float (ne10_fir_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */ + memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + return NE10_OK; +} + +/** + * @brief Initialization function for the floating-point FIR decimator. + * @param[in,out] *S points to an instance of the floating-point FIR decimator structure. + * @param[in] numTaps number of coefficients in the filter. + * @param[in] M decimation factor. + * @param[in] *pCoeffs points to the filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns NE10_OK if initialization was successful or NE10_ERR if + * blockSize is not a multiple of M. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32(). + * M is the decimation factor. + */ + +ne10_result_t ne10_fir_decimate_init_float ( + ne10_fir_decimate_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_uint8_t M, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize) +{ + ne10_result_t status; + + /* The size of the input block must be a multiple of the decimation factor */ + if ( (blockSize % M) != 0u) + { + /* Set status as NE10_ERR */ + status = NE10_ERR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always (blockSize + numTaps - 1) */ + memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + /* Assign Decimation Factor */ + S->M = M; + + status = NE10_OK; + } + + return (status); + +} + +/** + * @brief Initialization function for the floating-point FIR interpolator. + * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure. + * @param[in] L upsample factor. + * @param[in] numTaps number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficient buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of input samples to process per call. + * @return The function returns NE10_OK if initialization was successful or NE10_ERR if + * the filter length numTaps is not a multiple of the interpolation factor L. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * 
+ * The length of the filter numTaps must be a multiple of the interpolation factor L. + * \par + * pState points to the array of state variables. + * pState is of length (numTaps/L)+blockSize-1 words + * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32(). + */ + +ne10_result_t ne10_fir_interpolate_init_float ( + ne10_fir_interpolate_instance_f32_t * S, + ne10_uint8_t L, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize) +{ + ne10_result_t status; + + /* The filter length must be a multiple of the interpolation factor */ + if ( (numTaps % L) != 0u) + { + /* Set status as NE10_ERR */ + status = NE10_ERR; + } + else + { + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign Interpolation factor */ + S->L = L; + + /* Assign polyPhaseLength */ + S->phaseLength = numTaps / L; + + /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */ + memset (pState, 0, + (blockSize + + ( (ne10_uint32_t) S->phaseLength - 1u)) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = NE10_OK; + } + + return (status); + +} + +/** + * @brief Initialization function for the floating-point FIR lattice filter. + * @param[in] *S points to an instance of the floating-point FIR lattice structure. + * @param[in] numStages number of filter stages. + * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages. + * @param[in] *pState points to the state buffer. The array is of length numStages. + * @return none. + */ + +ne10_result_t ne10_fir_lattice_init_float ( + ne10_fir_lattice_instance_f32_t * S, + ne10_uint16_t numStages, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always numStages */ + memset (pState, 0, (numStages) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + return NE10_OK; +} + +/** + * @brief Initialization function for the floating-point sparse FIR filter. + * @param[in,out] *S points to an instance of the floating-point sparse FIR structure. + * @param[in] numTaps number of nonzero coefficients in the filter. + * @param[in] *pCoeffs points to the array of filter coefficients. + * @param[in] *pState points to the state buffer. + * @param[in] *pTapDelay points to the array of offset times. + * @param[in] maxDelay maximum offset time supported. + * @param[in] blockSize number of samples that will be processed per block. + * @return none + * + * Description: + * \par + * pCoeffs holds the filter coefficients and has length numTaps. + * pState holds the filter's state variables and must be of length + * maxDelay + blockSize, where maxDelay + * is the maximum number of delay line values. + * blockSize is the + * number of samples processed by the arm_fir_sparse_f32() function. + */ + +ne10_result_t ne10_fir_sparse_init_float ( + ne10_fir_sparse_instance_f32_t * S, + ne10_uint16_t numTaps, + ne10_float32_t * pCoeffs, + ne10_float32_t * pState, + ne10_int32_t * pTapDelay, + ne10_uint16_t maxDelay, + ne10_uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Assign TapDelay pointer */ + S->pTapDelay = pTapDelay; + + /* Assign MaxDelay */ + S->maxDelay = maxDelay; + + /* reset the stateIndex to 0 */ + S->stateIndex = 0u; + + /* Clear state buffer and size is always maxDelay + blockSize */ + memset (pState, 0, (maxDelay + blockSize) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + return NE10_OK; +} + + diff --git a/modules/dsp/NE10_iir.c b/modules/dsp/NE10_iir.c new file mode 100644 index 0000000..21afc7d --- /dev/null +++ b/modules/dsp/NE10_iir.c @@ -0,0 +1,306 @@ +/* + * Copyright 2012 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NE10 Library : dsp/NE10_iir.c + */ + +#include "NE10_types.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters + * + * This set of functions implements lattice filters + * for Q15, Q31 and floating-point data types. Lattice filters are used in a + * variety of adaptive filter applications. The filter structure has feedforward and + * feedback components and the net impulse response is infinite length. + * The functions operate on blocks + * of input and output data and each call to the function processes + * blockSize samples through the filter. pSrc and + * pDst point to input and output arrays containing blockSize values. + + * \par Algorithm: + * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter" + *
+ *    fN(n)   =  x(n)
+ *    fm-1(n) = fm(n) - km * gm-1(n-1)   for m = N, N-1, ...1
+ *    gm(n)   = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
+ *    y(n)    = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+ * 
+ * \par + * pkCoeffs points to array of reflection coefficients of size numStages. + * Reflection coefficients are stored in time-reversed order. + * \par + *
+ *    {kN, kN-1, ....k1}
+ * 
+ * pvCoeffs points to the array of ladder coefficients of size (numStages+1). + * Ladder coefficients are stored in time-reversed order. + * \par + *
+ *    v0, v1, ...vN
+ * 
+ * pState points to a state array of size numStages + blockSize. + * The state variables shown in the figure above (the g values) are stored in the pState array. + * The state variables are updated after each block of data is processed; the coefficients are untouched. + * \par Instance Structure + * The coefficients and state variables for a filter are stored together in an instance data structure. + * A separate instance structure must be defined for each filter. + * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + * There are separate instance structure declarations for each of the 3 supported data types. + * + * \par Initialization Functions + * There is also an associated initialization function for each data type. + * The initialization function performs the following operations: + * - Sets the values of the internal structure fields. + * - Zeros out the values in the state buffer. + * + * \par + * Use of the initialization function is optional. + * However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + * To place an instance structure into a const data section, the instance structure must be manually initialized. + * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows: + *
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ * 
+ * \par + * where numStages is the number of stages in the filter; pState points to the state buffer array; + * pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients. + * \par Fixed-Point Behavior + * Care must be taken when using the fixed-point versions of the IIR lattice filter functions. + * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + * Refer to the function specific documentation below for usage guidelines. + */ + +/** + * @addtogroup IIR_Lattice + * @{ + */ + +/** + * @brief Processing function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize) +{ + ne10_float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */ + ne10_float32_t acc; /* Accumlator */ + ne10_uint32_t blkCnt, tapCnt; /* temporary variables for counts */ + ne10_float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */ + ne10_uint32_t numStages = S->numStages; /* number of stages */ + ne10_float32_t *pState; /* State pointer */ + ne10_float32_t *pStateCurnt; /* State current pointer */ + + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + gcurr = 0.0f; + blkCnt = blockSize; + + pState = &S->pState[0]; + + /* Sample processing */ + while (blkCnt > 0u) + { + /* Read Sample from input buffer */ + /* fN(n) = x(n) */ + fcurr = *pSrc++; + + /* Initialize state read pointer */ + px1 = pState; + /* Initialize state write pointer */ + px2 = pState; + /* Set accumulator to zero */ + acc = 0.0f; + /* Initialize Ladder coeff pointer */ + pv = &S->pvCoeffs[S->numStages]; + /* Initialize Reflection coeff pointer */ + pk = &S->pkCoeffs[0]; + + + /* Process sample for first tap */ + gcurr = *px1++; + /* fN-1(n) = fN(n) - kN * gN-1(n-1) */ + fnext = fcurr - ( (*pk) * gcurr); + /* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + gnext = (fnext * (*pk++)) + gcurr; + /* write gN(n) into state for next sample processing */ + *px2++ = gnext; + /* y(n) += gN(n) * vN */ + acc += (gnext * (*pv--)); + + /* Update f values for next coefficient processing */ + fcurr = fnext; + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = (numStages - 1u) >> 2; + + while (tapCnt > 0u) + { + /* Process sample for 2nd, 6th ...taps */ + /* Read gN-2(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 2nd, 6th .. taps */ + /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */ + fnext = fcurr - ( (*pk) * gcurr); + /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */ + gnext = (fnext * (*pk++)) + gcurr; + /* y(n) += gN-1(n) * vN-1 */ + /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */ + acc += (gnext * (*pv--)); + /* write gN-1(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 3nd, 7th ...taps */ + /* Read gN-3(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 3rd, 7th .. taps */ + /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */ + fcurr = fnext - ( (*pk) * gcurr); + /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + /* y(n) += gN-2(n) * vN-2 */ + /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */ + acc += (gnext * (*pv--)); + /* write gN-2(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 4th, 8th ...taps */ + /* Read gN-4(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 4th, 8th .. taps */ + /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */ + fnext = fcurr - ( (*pk) * gcurr); + /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */ + gnext = (fnext * (*pk++)) + gcurr; + /* y(n) += gN-3(n) * vN-3 */ + /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */ + acc += (gnext * (*pv--)); + /* write gN-3(n) into state for next sample processing */ + *px2++ = gnext; + + + /* Process sample for 5th, 9th ...taps */ + /* Read gN-5(n-1) from state buffer */ + gcurr = *px1++; + /* Process sample for 5th, 9th .. taps */ + /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */ + fcurr = fnext - ( (*pk) * gcurr); + /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */ + gnext = (fcurr * (*pk++)) + gcurr; + /* y(n) += gN-4(n) * vN-4 */ + /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */ + acc += (gnext * (*pv--)); + /* write gN-4(n) into state for next sample processing */ + *px2++ = gnext; + + tapCnt--; + + } + + fnext = fcurr; + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + tapCnt = (numStages - 1u) % 0x4u; + + while (tapCnt > 0u) + { + gcurr = *px1++; + /* Process sample for last taps */ + fnext = fcurr - ( (*pk) * gcurr); + gnext = (fnext * (*pk++)) + gcurr; + /* Output samples for last taps */ + acc += (gnext * (*pv--)); + *px2++ = gnext; + fcurr = fnext; + + tapCnt--; + + } + + + /* y(n) += g0(n) * v0 */ + acc += (fnext * (*pv)); + + *px2++ = fnext; + + /* write out into pDst */ + *pDst++ = acc; + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 1u; + blkCnt--; + + } + + /* Processing is complete. Now copy last S->numStages samples to start of the buffer + for the preperation of next frame process */ + + /* Points to the start of the state buffer */ + pStateCurnt = &S->pState[0]; + pState = &S->pState[blockSize]; + + tapCnt = numStages >> 2u; + + /* copy data */ + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + + } + + /* Calculate remaining number of copies */ + tapCnt = (numStages) % 0x4u; + + /* Copy the remaining q31_t data */ + while (tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + + + + +/** + * @} end of IIR_Lattice group + */ diff --git a/modules/dsp/NE10_iir.neon.s b/modules/dsp/NE10_iir.neon.s new file mode 100644 index 0000000..81af588 --- /dev/null +++ b/modules/dsp/NE10_iir.neon.s @@ -0,0 +1,378 @@ +@/* +@ * Copyright 2012 ARM Limited +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License")@ +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ */ + +@/* +@ * NE10 Library : dsp/NE10_iir.neon.s +@ */ + + + .text + .syntax unified + + @/** + @ * @brief Processing function for the floating-point IIR lattice filter. + @ * @param[in] *S points to an instance of the floating-point IIR lattice structure. + @ * @param[in] *pSrc points to the block of input data. + @ * @param[out] *pDst points to the block of output data. + @ * @param[in] blockSize number of samples to process. + @ * @return none. + @ */ + + .align 4 + .global ne10_iir_lattice_float_neon + .extern ne10_qMaskTable32 + .thumb + .thumb_func + +ne10_iir_lattice_float_neon: + PUSH {r4-r12,lr} + @VPUSH {d8,d9} + +@/*ARM Registers*/ +pStateStruct .req R0 +pSrc .req R1 +pDst .req R2 +blockSize .req R3 + +pState .req R4 @/* State pointer */ +pKcoeffs .req R5 @/* Coefficient pointer */ +pVcoeffs .req R6 @/* Coefficient pointer */ + +pX .req R7 @/* Temporary pointers for state buffer */ +pK .req R8 @/* Temporary pointers for coefficient buffer */ +numStages .req R9 @/* Length of the filter */ + +tapCnt .req R10 @ /* Loop counter */ +pTemp .req R11 + + +pMask .req R14 @ /* Mask Table */ + +mask .req R12 +pV .req R12 + +@/*NEON variale Declaration*/ +dTemp3a_0 .dn D0.U32 +dTemp3_0 .dn D0.F32 +dMask2 .dn D1.U32 + +qGcurr .qn Q1.F32 +dGcurr_0 .dn D2.F32 +dGcurr_1 .dn D3.F32 + +qZero .qn Q2.F32 + +qMask .qn Q3.U32 +dMask_0 .dn D6.U32 +dMask_1 .dn D7.U32 +dOut_0 .dn D6.F32 +dOut_1 .dn D7.F32 + +qGK .qn Q4.F32 +dGK_0 .dn D8.F32 +dGK_1 .dn D9.F32 + +qAcc0 .qn Q8.F32 +dAcc0_0 .dn D16.F32 +dAcc0_1 .dn D17.F32 + +qTemp .qn Q9.F32 +dTemp_0 .dn D18.F32 +dTemp_1 .dn D19.F32 + +qFnext .qn Q10.F32 +dFnext_0 .dn D20.F32 +dFnext_1 .dn D21.F32 + +qFcurr .qn Q11.F32 +dFcurr_0 .dn D22.F32 +dFcurr_1 .dn D23.F32 + +qCoeff0 .qn Q12.F32 +dCoeff0_0 .dn D24.F32 +dCoeff0_1 .dn D25.F32 + +qMask1 .qn Q13.U32 +dMask1_0 .dn D26.U32 +dMask1_1 .dn D27.U32 + + +qMaskTmp .qn Q14.U32 +dMaskTmp_0 .dn D28.U32 +dMaskTmp_1 .dn D29.U32 + +qGnext .qn Q15.F32 +dGnext_0 .dn D30.F32 +dGnext_1 .dn D31.F32 + + + @/* Length of the filter */ + LDRH numStages,[pStateStruct],#4 + @/* State pointer */ + LDR pState,[pStateStruct],#4 + @/* Coefficient pointer */ + LDR pKcoeffs,[pStateStruct],#4 + LDR pVcoeffs,[pStateStruct],#4 + + + @/*Load Mask Valies*/ + LDR pMask,=ne10_qMaskTable32 + AND mask,numStages,#3 + ADD tapCnt,mask,#1 + + ADD pTemp,pMask,mask,LSL #4 + ADD tapCnt,pMask,tapCnt,LSL #4 + + VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp] + VLD1 {dMask1_0,dMask1_1},[tapCnt] + + ADD pTemp,pMask,#16 + VEOR qZero,qZero + VLD1 {dMask2},[pTemp] + + @/*while blockSize > 0*/ + CMP blockSize, #0 + BEQ firLatticeCopy + + +firLatticeOuterLoop: + VLD1 {dFcurr_0[],dFcurr_1[]},[pSrc]! + MOV pX,pState + VEOR qAcc0,qAcc0 + @/* Initialize Ladder coeff pointer */ + ADD pV,pVcoeffs,numStages, LSL #2 + MOV pK,pKcoeffs + + VLD1 {dGcurr_0,dGcurr_1},[pX] + @/* Load the filter Taps */ + VLD1 {dCoeff0_0,dCoeff0_1},[pK]! + + SUBS tapCnt,numStages,#4 + ADD pV,pV,#4 + BLT firLatticeEndInnerLoop + + +firLatticeInnerLoop: + + + + VMUL qGK,qGcurr,qCoeff0 + + @/* g4k4+g5k5 g6k6+g7k7*/ + VPADD dTemp_0,dGK_1,dGK_0 + @/*g6k6 g4k4+g5k5*/ + VEXT dTemp_1,dTemp_0,dGK_1,#1 + @/*g7k7+g6k6+g5k5+g4k4 g6k6+g5k5+g4k4*/ + VPADD dTemp_1,dTemp_1,dTemp_0 + VMOV dTemp3a_0,dMask2 + VBSL dTemp3a_0,dGK_0,dTemp_0 + VMOV dTemp_0,dTemp3_0 + VSUB qFnext,qFcurr,qTemp + + @/* gN(n) = kN * fN-1(n) + gN-1(n-1) */ + VMLA qGcurr,qFnext,qCoeff0 + + @/* y(n) += gN(n) * vN */ + SUB pV,pV,#16 + VLD1 {dCoeff0_0,dCoeff0_1},[pV] + + @/* write gN-1(n-1) into state for next sample processing */ + VST1 {dGcurr_0,dGcurr_1},[pX]! + VREV64 qCoeff0,qCoeff0 + @/* acc0 += gnext * (*pv--)@ */ + VMLA dAcc0_0,dGcurr_0,dCoeff0_1 + VMLA dAcc0_1,dGcurr_1,dCoeff0_0 + + @/* Update f values for next coefficients processing */ + + VDUP qFcurr,dFnext_1[1] + + VLD1 {dGcurr_0,dGcurr_1},[pX] + @/* Load the filter Taps */ + VLD1 {dCoeff0_0,dCoeff0_1},[pK]! + + SUBS tapCnt,#4 + BGE firLatticeInnerLoop +firLatticeEndInnerLoop: + @/* If the filter length is not a multiple of 4, compute the remaining filter taps */ + ADDS tapCnt,#4 + IT GT + SUBGT tapCnt,#1 + + + VMUL qGK,qGcurr,qCoeff0 + + VPADD dTemp_0,dGK_1,dGK_0 + VEXT dTemp_1,dTemp_0,dGK_1,#1 + VPADD dTemp_1,dTemp_1,dTemp_0 + VMOV dTemp3a_0,dMask2 + VBSL dTemp3a_0,dGK_0,dTemp_0 + VMOV dTemp_0,dTemp3_0 + + @/*Mask the Uncessary f values*/ + VMOV qFnext,qMaskTmp + VBSL qFnext,qTemp,qZero + VSUB qFnext,qFcurr,qFnext + + VMOV qGnext,qGcurr + VMLA qGnext,qFnext,qCoeff0 + + @/*Store on to stack for getting proper Fnext*/ + SUB pTemp,SP,#20 + VST1 {dFnext_0,dFnext_1},[pTemp] + + ADD pTemp,pTemp,tapCnt, LSL #2 + VLD1 {dTemp_0[],dTemp_1[]},[pTemp] + + VMOV qGcurr,qMaskTmp + VBSL qGcurr,qGnext,qTemp + + VLD1 {dTemp_0,dTemp_1},[pX] + VMOV qMask,qMask1 + VBSL qMask,qGcurr,qTemp + VST1 {dMask_0,dMask_1},[pX] + + ADD pX,pX,tapCnt,LSL #2 + + SUB pV,pV,#16 + VLD1 {dCoeff0_0,dCoeff0_1},[pV] + + @// MASk the Gnext value used for Output calculation + VMOV qGnext,qMask1 + VBSL qGnext,qGcurr,qZero + ADD pX,pX,#4 + + VREV64 qCoeff0,qCoeff0 + + VMLA dAcc0_0,dGnext_0,dCoeff0_1 + VMLA dAcc0_1,dGnext_1,dCoeff0_0 + + /*Get Accumulated Result in to single Value*/ + + VLD1 {dTemp_1},[pDst] + VPADD dTemp_0,dAcc0_0,dAcc0_1 + VPADD dTemp_0,dTemp_0 + + VMOV dMask_0,dMask2 + VBSL dMask_0,dTemp_0,dTemp_1 + + VST1 {dMask_0},[pDst] + ADD pDst,#4 + ADD pState,#4 + + SUBS blockSize,#1 + + BGT firLatticeOuterLoop + + + @/* copy last S->numStages samples to start of the buffer + @for next frame process */ + +firLatticeCopy: + AND mask,numStages,#3 + ADD pTemp,pMask,mask,LSL #4 + LDR pX,[pStateStruct,#-12] + + VLD1 {dFcurr_0,dFcurr_1},[pState]! + VLD1 {dMask_0,dMask_1},[pTemp] + SUBS tapCnt,numStages,#4 + BLT firLatticeEnd +firLatticeCopyLoop: + VST1 {dFcurr_0,dFcurr_1},[pX]! + SUBS tapCnt,#4 + VLD1 {dFcurr_0,dFcurr_1},[pState]! + BGE firLatticeCopyLoop +firLatticeEnd: + VLD1 {dTemp_0,dTemp_1},[pX] + VBSL qMask,qFcurr,qTemp + VST1 {dOut_0,dOut_1},[pX] + ADD pX,pX,mask, LSL #2 +@/*ARM Registers*/ +.unreq pStateStruct +.unreq pSrc +.unreq pDst +.unreq blockSize + +.unreq pState +.unreq pKcoeffs +.unreq pVcoeffs + +.unreq pX +.unreq pK +.unreq numStages + +.unreq tapCnt +.unreq pTemp +.unreq pMask +.unreq mask +.unreq pV + +@/*NEON variale Declaration*/ +.unreq dTemp3a_0 +.unreq dTemp3_0 +.unreq dMask2 + +.unreq qGcurr +.unreq dGcurr_0 +.unreq dGcurr_1 + +.unreq qZero +.unreq qMask +.unreq dMask_0 +.unreq dMask_1 +.unreq dOut_0 +.unreq dOut_1 + +.unreq qGK +.unreq dGK_0 +.unreq dGK_1 + +.unreq qAcc0 +.unreq dAcc0_0 +.unreq dAcc0_1 + +.unreq qTemp +.unreq dTemp_0 +.unreq dTemp_1 + +.unreq qFnext +.unreq dFnext_0 +.unreq dFnext_1 + +.unreq qFcurr +.unreq dFcurr_0 +.unreq dFcurr_1 + +.unreq qCoeff0 +.unreq dCoeff0_0 +.unreq dCoeff0_1 + +.unreq qMask1 +.unreq dMask1_0 +.unreq dMask1_1 + +.unreq qMaskTmp +.unreq dMaskTmp_0 +.unreq dMaskTmp_1 + +.unreq qGnext +.unreq dGnext_0 +.unreq dGnext_1 + + @VPOP {d8,d9} + POP {r4-r12,pc} + + .end diff --git a/modules/dsp/NE10_iir_init.c b/modules/dsp/NE10_iir_init.c new file mode 100644 index 0000000..22185dc --- /dev/null +++ b/modules/dsp/NE10_iir_init.c @@ -0,0 +1,61 @@ +/* + * Copyright 2012 ARM Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NE10 Library : dsp/NE10_iir_init.c + */ +#include "NE10_types.h" + + +/** + * @brief Initialization function for the floating-point IIR lattice filter. + * @param[in] *S points to an instance of the floating-point IIR lattice structure. + * @param[in] numStages number of stages in the filter. + * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages. + * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1. + * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize. + * @param[in] blockSize number of samples to process. + * @return none. + */ + +ne10_result_t ne10_iir_lattice_init_float (ne10_iir_lattice_instance_f32_t * S, + ne10_uint16_t numStages, + ne10_float32_t * pkCoeffs, + ne10_float32_t * pvCoeffs, + ne10_float32_t * pState, + ne10_uint32_t blockSize) +{ + /* Assign filter taps */ + S->numStages = numStages; + + /* Assign reflection coefficient pointer */ + S->pkCoeffs = pkCoeffs; + + /* Assign ladder coefficient pointer */ + S->pvCoeffs = pvCoeffs; + + /* Clear state buffer and size is always blockSize + numStages */ + memset (pState, 0, (numStages + blockSize) * sizeof (ne10_float32_t)); + + /* Assign state pointer */ + S->pState = pState; + + return NE10_OK; +} + +/** + * @} end of IIR_Lattice group + */ diff --git a/modules/dsp/NE10_init_dsp.c b/modules/dsp/NE10_init_dsp.c index fefcc1e..7885a0c 100644 --- a/modules/dsp/NE10_init_dsp.c +++ b/modules/dsp/NE10_init_dsp.c @@ -25,12 +25,28 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available) ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_neon; ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_neon; ne10_rfft_float = ne10_rfft_float_neon; + + ne10_fir_float = ne10_fir_float_neon; + ne10_fir_decimate_float = ne10_fir_decimate_float_neon; + ne10_fir_interpolate_float = ne10_fir_interpolate_float_neon; + ne10_fir_lattice_float = ne10_fir_lattice_float_neon; + ne10_fir_sparse_float = ne10_fir_sparse_float_neon; + + ne10_iir_lattice_float = ne10_iir_lattice_float_neon; } else { ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_c; ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_c; ne10_rfft_float = ne10_rfft_float_c; + + ne10_fir_float = ne10_fir_float_c; + ne10_fir_decimate_float = ne10_fir_decimate_float_c; + ne10_fir_interpolate_float = ne10_fir_interpolate_float_c; + ne10_fir_lattice_float = ne10_fir_lattice_float_c; + ne10_fir_sparse_float = ne10_fir_sparse_float_c; + + ne10_iir_lattice_float = ne10_iir_lattice_float_c; } return NE10_OK; } @@ -51,3 +67,38 @@ void (*ne10_rfft_float)( ne10_float32_t * pSrc, ne10_float32_t * pDst, ne10_float32_t * pTemp); + +void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +void (*ne10_fir_decimate_float)( + const ne10_fir_decimate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +void (*ne10_fir_interpolate_float)( + const ne10_fir_interpolate_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +void (*ne10_fir_lattice_float)( + const ne10_fir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); + +void (*ne10_fir_sparse_float)( + ne10_fir_sparse_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_float32_t * pScratchIn, + ne10_uint32_t blockSize); + +void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S, + ne10_float32_t * pSrc, + ne10_float32_t * pDst, + ne10_uint32_t blockSize); -- 2.7.4