};
+const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]=
+{
+ 65535,32768,21845,16384,13107,10923,9362,8192,7282,6554,5958,5461,5041,4681,4369,4096,
+ 3855,3641,3449,3277,3121,2979,2849,2731,2621,2521,2427,2341,2260,2185,2114,2048,
+ 1986,1928,1872,1820,1771,1725,1680,1638,1598,1560,1524,1489,1456,1425,1394,1365,
+ 1337,1311,1285,1260,1237,1214,1192,1170,1150,1130,1111,1092,1074,1057,1040,1024,
+ 1008,993,978,964,950,936,923,910,898,886,874,862,851,840,830,819,
+ 809,799,790,780,771,762,753,745,736,728,720,712,705,697,690,683,
+ 676,669,662,655,649,643,636,630,624,618,612,607,601,596,590,585,
+ 580,575,570,565,560,555,551,546,542,537,533,529,524,520,516,512,
+ 508,504,500,496,493,489,485,482,478,475,471,468,465,462,458,455,
+ 452,449,446,443,440,437,434,431,428,426,423,420,417,415,412,410,
+ 407,405,402,400,397,395,392,390,388,386,383,381,379,377,374,372,
+ 370,368,366,364,362,360,358,356,354,352,350,349,347,345,343,341,
+ 340,338,336,334,333,331,329,328,326,324,323,321,320,318,317,315,
+ 314,312,311,309,308,306,305,303,302,301,299,298,297,295,294,293,
+ 291,290,289,287,286,285,284,282,281,280,279,278,277,275,274,273,
+ 272,271,270,269,267,266,265,264,263,262,261,260,259,258,257
+ };
#ifndef _ARM_MASK_TABLE_H
#define _ARM_MASK_TABLE_H
-#define Q_MASK_TABLE_SIZE 20
-#define D_MASK_TABLE_SIZE 6
+#define Q_MASK_TABLE_SIZE 20
+#define D_MASK_TABLE_SIZE 6
+#define DIV_LOOKUP_TABLE_SIZE 255
extern const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE];
extern const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE];
+extern const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE];
#endif
ne10_float32_t * pSrc,
ne10_float32_t * pDst,
ne10_float32_t * pTemp);
+/* fir functions*/
+
+/* function pointers*/
+extern void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_decimate_float)(
+ const ne10_fir_decimate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_interpolate_float)(
+ const ne10_fir_interpolate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_lattice_float)(
+ const ne10_fir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_sparse_float)(
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_float32_t * pScratchIn,
+ ne10_uint32_t blockSize);
+
+
+/* init functions*/
+extern ne10_result_t ne10_fir_init_float(ne10_fir_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_decimate_init_float(
+ ne10_fir_decimate_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_uint8_t M,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_interpolate_init_float(
+ ne10_fir_interpolate_instance_f32_t * S,
+ ne10_uint8_t L,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_lattice_init_float(
+ ne10_fir_lattice_instance_f32_t * S,
+ ne10_uint16_t numStages,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState);
+
+extern ne10_result_t ne10_fir_sparse_init_float(
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_int32_t * pTapDelay,
+ ne10_uint16_t maxDelay,
+ ne10_uint32_t blockSize);
+
+/* C version*/
+extern void ne10_fir_float_c(const ne10_fir_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_decimate_float_c(
+ const ne10_fir_decimate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_interpolate_float_c(
+ const ne10_fir_interpolate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_lattice_float_c(
+ const ne10_fir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_sparse_float_c(
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_float32_t * pScratchIn,
+ ne10_uint32_t blockSize);
+
+
+/* NEON version*/
+extern void ne10_fir_float_neon(const ne10_fir_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_decimate_float_neon(const ne10_fir_decimate_instance_f32_t * S,
+ ne10_float32_t *pSrc,
+ ne10_float32_t *pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_interpolate_float_neon(const ne10_fir_interpolate_instance_f32_t * S,
+ ne10_float32_t *pSrc,
+ ne10_float32_t *pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_lattice_float_neon(const ne10_fir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+extern void ne10_fir_sparse_float_neon(ne10_fir_sparse_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_float32_t * pScratch,
+ ne10_uint32_t blockSize);
+
+
+/* iir functions*/
+
+/* function pointers*/
+extern void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+/* init functions*/
+extern ne10_result_t ne10_iir_lattice_init_float(ne10_iir_lattice_instance_f32_t * S,
+ ne10_uint16_t numStages,
+ ne10_float32_t * pkCoeffs,
+ ne10_float32_t * pvCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize);
+
+
+/* C version*/
+extern void ne10_iir_lattice_float_c(const ne10_iir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+/* NEON version*/
+extern void ne10_iir_lattice_float_neon(const ne10_iir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
#ifdef __cplusplus
}
ne10_cfft_radix4_instance_f32_t *p_cfft; /**< Pointer to the complex FFT Instance. */
} ne10_rfft_instance_f32_t;
+/////////////////////////////////////////////////////////
+// definitions for fir
+/////////////////////////////////////////////////////////
+
+/*
+ * @brief Instance structure for the floating-point FIR filter.
+ */
+typedef struct
+{
+ ne10_uint16_t numTaps; /**< Length of the filter. */
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+ ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps. */
+} ne10_fir_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating point FIR Lattice filter.
+ */
+typedef struct
+{
+ ne10_uint16_t numStages; /**< numStages of the of lattice filter. */
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numStages. */
+ ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numStages. */
+} ne10_fir_lattice_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Decimation.
+ */
+typedef struct
+{
+ ne10_uint8_t M; /**< Decimation Factor. */
+ ne10_uint16_t numTaps; /**< Length of the filter. */
+ ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+} ne10_fir_decimate_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Interpolation.
+ */
+typedef struct
+{
+ ne10_uint8_t L; /**< Interpolation Factor. */
+ ne10_uint16_t phaseLength; /**< Length of each polyphase filter component. */
+ ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+} ne10_fir_interpolate_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Sparse filter.
+ */
+typedef struct
+{
+ ne10_uint16_t numTaps; /**< Length of the filter. */
+ ne10_uint16_t stateIndex; /**< Index pointer for the state buffer .*/
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+ ne10_float32_t *pCoeffs; /**< Points to the coefficient array. The array is of length numTaps.*/
+ ne10_uint16_t maxDelay; /**< the largest number of delay line values .*/
+ ne10_int32_t *pTapDelay; /**< Pointer to the array containing positions of the non-zero tap values. */
+} ne10_fir_sparse_instance_f32_t;
+
+/**
+ * @brief Instance structure for the floating point IIR Lattice filter.
+ */
+typedef struct
+{
+ ne10_uint16_t numStages; /**< numStages of the of lattice filter. */
+ ne10_float32_t *pState; /**< Points to the state variable array. The array is of length numStages + blockSize -1. */
+ ne10_float32_t *pkCoeffs; /**< Points to the reflection coefficient array. The array is of length numStages. */
+ ne10_float32_t *pvCoeffs; /**< Points to the ladder coefficient array. The array is of length numStages+1. */
+} ne10_iir_lattice_instance_f32_t;
+
#endif
${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft_init.c
${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft.c
${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_init.c
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.c
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir_init.c
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.c
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir_init.c
)
# Add dsp intrinsic NEON files.
# Add dsp NEON files.
set(NE10_DSP_NEON_SRCS
${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft.neon.s
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.neon.s
+ ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.neon.s
)
# Tell CMake these files need to go to the C compiler
--- /dev/null
+/*
+ * Copyright 2012 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_fir.c
+ */
+
+#include "NE10_types.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup FIR Finite Impulse Response (FIR) Filters
+ *
+ * This set of functions implements Finite Impulse Response (FIR) filters
+ * for floating-point data types.
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter. <code>pSrc</code> and
+ * <code>pDst</code> points to input and output arrays containing <code>blockSize</code> values.
+ *
+ * \par Algorithm:
+ * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
+ * Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
+ * <pre>
+ * y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * </pre>
+ * \par
+ * \image html FIR.gif "Finite Impulse Response filter"
+ * \par
+ * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
+ * Coefficients are stored in time reversed order.
+ * \par
+ * <pre>
+ * {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
+ * Samples in the state buffer are stored in the following order.
+ * \par
+ * <pre>
+ * {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * </pre>
+ * \par
+ * Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
+ * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
+ * to be avoided and yields a significant speed improvement.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 4 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 4 different data type filter instance structures
+ * <pre>
+ *ne10_fir_instance_f32_t S = {numTaps, pState, pCoeffs};
+ * </pre>
+ *
+ * where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
+ * <code>pCoeffs</code> is the address of the coefficient buffer.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ *
+ * @param[in] *S points to an instance of the floating-point FIR filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ */
+
+void ne10_fir_float_c (
+ const ne10_fir_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize)
+{
+
+ ne10_float32_t *pState = S->pState; /* State pointer */
+ ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
+ ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ ne10_uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ ne10_float32_t acc0, acc1, acc2, acc3; /* Accumulators */
+ ne10_float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = & (S->pState[ (numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2u;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x3 = * (px++);
+
+ /* acc0 += b[numTaps-1] * x[n-numTaps] */
+ acc0 += x0 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
+ acc2 += x2 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
+ acc3 += x3 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c0;
+ acc1 += x2 * c0;
+ acc2 += x3 * c0;
+ acc3 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = * (px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x2 * c0;
+ acc1 += x3 * c0;
+ acc2 += x0 * c0;
+ acc3 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = * (px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x3 * c0;
+ acc1 += x0 * c0;
+ acc2 += x1 * c0;
+ acc3 += x2 * c0;
+
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while (tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = * (pb++);
+
+ /* Fetch 1 state variable */
+ x3 = * (px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4;
+
+ /* The results in the 4 accumulators, store in the destination buffer. */
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ *pDst++ = acc2;
+ *pDst++ = acc3;
+
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while (blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = (pCoeffs);
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ acc0 += *px++ * *pb++;
+ i--;
+
+ }
+ while (i > 0u);
+
+ /* The result is store in the destination buffer. */
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ * @brief Processing function for the floating-point FIR decimator.
+ * @param[in] *S points to an instance of the floating-point FIR decimator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void ne10_fir_decimate_float_c (
+ const ne10_fir_decimate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize)
+{
+ ne10_float32_t *pState = S->pState; /* State pointer */
+ ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
+ ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ ne10_float32_t sum0; /* Accumulator */
+ ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize;
+
+ while (blkCnt > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ }
+ while (--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = * (pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while (tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = * (pb++);
+
+ /* Fetch 1 state variable */
+ x0 = * (px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1u) >> 2;
+
+ /* copy data */
+ while (i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1u) % 0x04u;
+
+ /* copy data */
+ while (i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+}
+
+/**
+ * @brief Processing function for the floating-point FIR interpolator.
+ * @param[in] *S points to an instance of the floating-point FIR interpolator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void ne10_fir_interpolate_float_c (
+ const ne10_fir_interpolate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize)
+{
+ ne10_float32_t *pState = S->pState; /* State pointer */
+ ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
+ ne10_float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ ne10_float32_t sum0; /* Accumulators */
+ ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ ne10_uint32_t i, blkCnt, j; /* Loop counters */
+ ne10_uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (phaseLen - 1u);
+
+ /* Total number of intput samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while (blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+ while (i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2u;
+ while (tapCnt > 0u)
+ {
+
+ /* Read the coefficient */
+ c0 = * (ptr2);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = * (ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = * (ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = * (ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = * (ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = * (ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = * (ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = * (ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4u;
+
+ while (tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += * (ptr1++) * (*ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (phaseLen - 1u) >> 2u;
+
+ /* copy data */
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (phaseLen - 1u) % 0x04u;
+
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ * @brief Processing function for the floating-point FIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point FIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void ne10_fir_lattice_float_c (
+ const ne10_fir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize)
+{
+ ne10_float32_t *pState; /* State pointer */
+ ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ ne10_float32_t *px; /* temporary state pointer */
+ ne10_float32_t *pk; /* temporary coefficient pointer */
+
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ ne10_float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
+ ne10_float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
+ ne10_float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
+ ne10_float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
+ ne10_uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ ne10_uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ gcurr1 = 0.0f;
+ pState = &S->pState[0];
+
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0u)
+ {
+
+ /* Read two samples from input buffer */
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+ fcurr2 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Read g0(n-1) from state */
+ gcurr1 = *px;
+
+ /* Process first sample for first tap */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (fcurr1 * (*pk)) + gcurr1;
+
+ /* Process second sample for first tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ( (*pk) * fcurr1);
+ gnext2 = (fcurr2 * (*pk)) + fcurr1;
+
+ /* Read next two samples from input buffer */
+ /* f0(n+2) = x(n+2) */
+ fcurr3 = *pSrc++;
+ fcurr4 = *pSrc++;
+
+ /* Copy only last input samples into the state buffer
+ which will be used for next four samples processing */
+ *px++ = fcurr4;
+
+ /* Process third sample for first tap */
+ fnext3 = fcurr3 + ( (*pk) * fcurr2);
+ gnext3 = (fcurr3 * (*pk)) + fcurr2;
+
+ /* Process fourth sample for first tap */
+ fnext4 = fcurr4 + ( (*pk) * fcurr3);
+ gnext4 = (fcurr4 * (*pk++)) + fcurr3;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
+ fcurr4 = fnext4;
+
+ /* Loop unrolling. Process 4 taps at a time . */
+ stageCnt = (numStages - 1u) >> 2u;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numStages-3 coefficients. */
+
+ /* Process 2nd, 3rd, 4th and 5th taps ... here */
+ while (stageCnt > 0u)
+ {
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Process first sample for 2nd, 6th .. tap */
+ /* Sample processing for K2, K6.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ /* Process second sample for 2nd, 6th .. tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ( (*pk) * gnext1);
+ /* Process third sample for 2nd, 6th .. tap */
+ fnext3 = fcurr3 + ( (*pk) * gnext2);
+ /* Process fourth sample for 2nd, 6th .. tap */
+ fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g2(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K3, K7.... */
+ /* Process first sample for 3rd, 7th .. tap */
+ /* f3(n) = f2(n) + K3 * g2(n-1) */
+ fcurr1 = fnext1 + ( (*pk) * gcurr1);
+ /* Process second sample for 3rd, 7th .. tap */
+ fcurr2 = fnext2 + ( (*pk) * gnext1);
+ /* Process third sample for 3rd, 7th .. tap */
+ fcurr3 = fnext3 + ( (*pk) * gnext2);
+ /* Process fourth sample for 3rd, 7th .. tap */
+ fcurr4 = fnext4 + ( (*pk) * gnext3);
+
+ /* Calculation of state values for next stage */
+ /* g3(n) = f2(n) * K3 + g2(n-1) */
+ gnext4 = (fnext4 * (*pk)) + gnext3;
+ gnext3 = (fnext3 * (*pk)) + gnext2;
+ gnext2 = (fnext2 * (*pk)) + gnext1;
+ gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g3(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K4, K8.... */
+ /* Process first sample for 4th, 8th .. tap */
+ /* f4(n) = f3(n) + K4 * g3(n-1) */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ /* Process second sample for 4th, 8th .. tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ( (*pk) * gnext1);
+ /* Process third sample for 4th, 8th .. tap */
+ fnext3 = fcurr3 + ( (*pk) * gnext2);
+ /* Process fourth sample for 4th, 8th .. tap */
+ fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+ /* g4(n) = f3(n) * K4 + g3(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g4(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K5, K9.... */
+ /* Process first sample for 5th, 9th .. tap */
+ /* f5(n) = f4(n) + K5 * g4(n-1) */
+ fcurr1 = fnext1 + ( (*pk) * gcurr1);
+ /* Process second sample for 5th, 9th .. tap */
+ fcurr2 = fnext2 + ( (*pk) * gnext1);
+ /* Process third sample for 5th, 9th .. tap */
+ fcurr3 = fnext3 + ( (*pk) * gnext2);
+ /* Process fourth sample for 5th, 9th .. tap */
+ fcurr4 = fnext4 + ( (*pk) * gnext3);
+
+ /* Calculation of state values for next stage */
+ /* g5(n) = f4(n) * K5 + g4(n-1) */
+ gnext4 = (fnext4 * (*pk)) + gnext3;
+ gnext3 = (fnext3 * (*pk)) + gnext2;
+ gnext2 = (fnext2 * (*pk)) + gnext1;
+ gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+ stageCnt--;
+ }
+
+ /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+ stageCnt = (numStages - 1u) % 0x4u;
+
+ while (stageCnt > 0u)
+ {
+ gcurr1 = *px;
+
+ /* save g value in state buffer */
+ *px++ = gnext4;
+
+ /* Process four samples for last three taps here */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ fnext2 = fcurr2 + ( (*pk) * gnext1);
+ fnext3 = fcurr3 + ( (*pk) * gnext2);
+ fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
+ fcurr4 = fnext4;
+
+ stageCnt--;
+
+ }
+
+ /* The results in the 4 accumulators, store in the destination buffer. */
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+ *pDst++ = fcurr2;
+ *pDst++ = fcurr3;
+ *pDst++ = fcurr4;
+
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while (blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* save g1(n) in state buffer */
+ *px++ = fcurr1;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while (stageCnt > 0u)
+ {
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext1;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = fcurr1 + ( (*pk) * gcurr1);
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+
+ blkCnt--;
+
+ }
+
+}
+/**
+ * @brief floating-point Circular write function.
+ */
+
+static void ne10_circular_write_float (
+ ne10_int32_t * circBuffer,
+ ne10_int32_t L,
+ ne10_uint16_t * writeOffset,
+ ne10_int32_t bufferInc,
+ const ne10_int32_t * src,
+ ne10_int32_t srcInc,
+ ne10_uint32_t blockSize)
+{
+ ne10_uint32_t i = 0u;
+ ne10_int32_t wOffset;
+
+ /* Copy the value of Index pointer that points
+ * to the current location where the input samples to be copied */
+ wOffset = *writeOffset;
+
+ /* Loop over the blockSize */
+ i = blockSize;
+
+ while (i > 0u)
+ {
+ /* copy the input sample to the circular buffer */
+ circBuffer[wOffset] = *src;
+
+ /* Update the input pointer */
+ src += srcInc;
+
+ /* Circularly update wOffset. Watch out for positive and negative value */
+ wOffset += bufferInc;
+ if (wOffset >= L)
+ wOffset -= L;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Update the index pointer */
+ *writeOffset = wOffset;
+}
+
+
+
+/**
+ * @brief floating-point Circular Read function.
+ */
+static void ne10_circular_read_float (
+ ne10_int32_t * circBuffer,
+ ne10_int32_t L,
+ ne10_int32_t * readOffset,
+ ne10_int32_t bufferInc,
+ ne10_int32_t * dst,
+ ne10_int32_t * dst_base,
+ ne10_int32_t dst_length,
+ ne10_int32_t dstInc,
+ ne10_uint32_t blockSize)
+{
+ ne10_uint32_t i = 0u;
+ ne10_int32_t rOffset, dst_end;
+
+ /* Copy the value of Index pointer that points
+ * to the current location from where the input samples to be read */
+ rOffset = *readOffset;
+ dst_end = (ne10_int32_t) (dst_base + dst_length);
+
+ /* Loop over the blockSize */
+ i = blockSize;
+
+ while (i > 0u)
+ {
+ /* copy the sample from the circular buffer to the destination buffer */
+ *dst = circBuffer[rOffset];
+
+ /* Update the input pointer */
+ dst += dstInc;
+
+ if (dst == (ne10_int32_t *) dst_end)
+ {
+ dst = dst_base;
+ }
+
+ /* Circularly update rOffset. Watch out for positive and negative value */
+ rOffset += bufferInc;
+
+ if (rOffset >= L)
+ {
+ rOffset -= L;
+ }
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Update the index pointer */
+ *readOffset = rOffset;
+}
+
+
+/**
+ * @brief Processing function for the floating-point sparse FIR filter.
+ * @param[in] *S points to an instance of the floating-point sparse FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void ne10_fir_sparse_float_c (
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_float32_t * pScratchIn,
+ ne10_uint32_t blockSize)
+{
+
+ ne10_float32_t *pState = S->pState; /* State pointer */
+ ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ ne10_float32_t *px; /* Scratch buffer pointer */
+ ne10_float32_t *py = pState; /* Temporary pointers for state buffer */
+ ne10_float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ ne10_float32_t *pOut; /* Destination pointer */
+ ne10_int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ ne10_uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ ne10_uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ ne10_int32_t readIndex; /* Read index of the state buffer */
+ ne10_uint32_t tapCnt, blkCnt; /* loop counters */
+ ne10_float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
+
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1,
+ (ne10_int32_t *) pSrc, 1, blockSize);
+
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (ne10_int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
+ (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer */
+ px = pb;
+
+ /* Working pointer for destination buffer */
+ pOut = pDst;
+
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 Multiplications at a time. */
+ blkCnt = blockSize >> 2u;
+
+ while (blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while (blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (ne10_int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (ne10_uint32_t) numTaps - 1u;
+
+ while (tapCnt > 0u)
+ {
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
+ (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer */
+ px = pb;
+
+ /* Working pointer for destination buffer */
+ pOut = pDst;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 MACS at a time. */
+ blkCnt = blockSize >> 2u;
+
+ while (blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while (blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ( (ne10_int32_t) S->stateIndex -
+ (ne10_int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (ne10_int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+}
+
+
+/**
+ * @} end of FIR group
+ */
--- /dev/null
+@/*
+@ * Copyright 2012 ARM Limited
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License")@
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ */
+
+@/*
+@ * NE10 Library : dsp/NE10_fir.neon.s
+@ */
+
+
+ .text
+ .syntax unified
+
+ @/**
+ @ * @details
+ @ * This function operates on floating-point data types.
+ @ * There are no restrictions on numTaps and blockSize.
+ @ *
+ @ * The order of the coefficients in *coeffs should be
+ @ * bN, bN-1, bN-2, .....b1, b0
+ @ *
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code>45 + 8 * numTaps + 12.25 * blockSize + 4.375 * numTaps * blockSize</code>
+ @ *
+ @ * @param[in] *S points to struct parameter
+ @ * @param[in] *pSrc points to the input buffer
+ @ * @param[out] *pDst points to the output buffer
+ @ * @param[in] blockSize block size of filter
+ @ */
+
+ .align 4
+ .global ne10_fir_float_neon
+ .extern ne10_qMaskTable32
+ .thumb
+ .thumb_func
+
+ne10_fir_float_neon:
+ PUSH {r4-r12,lr}
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+blockSize .req R3
+
+pState .req R4 @/* State pointer */
+pCoeffs .req R5 @/* Coefficient pointer */
+pStateCurnt .req R6 @/* Points to the current sample of the state */
+
+pX .req R7 @/* Temporary pointers for state buffer */
+pB .req R8 @/* Temporary pointers for coefficient buffer */
+numTaps .req R9 @/* Length of the filter */
+
+tapCnt .req R10 @ /* Loop counter */
+Count .req R11 @ /* Loop counter */
+pTemp .req R11
+pMask .req R14 @ /* Mask Table */
+
+mask .req R12
+
+@/*NEON variale Declaration*/
+qInp .qn Q0.F32
+dInp_0 .dn D0.F32
+dInp_1 .dn D1.F32
+qCoeff .qn Q1.F32
+dCoeff_0 .dn D2.F32
+dCoeff_1 .dn D3.F32
+qZero .qn Q2.F32
+
+qMask .qn Q3.U32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+dOut_0 .dn D6.F32
+dOut_1 .dn D7.F32
+
+qAcc0 .qn Q8.F32
+dAcc0_0 .dn D16.F32
+dAcc0_1 .dn D17.F32
+
+
+qTemp .qn Q9.F32
+dTemp_0 .dn D18.F32
+dTemp_1 .dn D19.F32
+
+qTemp1 .qn Q10.F32
+dTemp1_0 .dn D20.F32
+dTemp1_1 .dn D21.F32
+qTemp2 .qn Q11.F32
+qTemp3 .qn Q12.F32
+qMask1 .qn Q13.U32
+dMask1_0 .dn D26.U32
+dMask1_1 .dn D27.U32
+qMaskTmp .qn Q14.U32
+dMaskTmp_0 .dn D28.U32
+dMaskTmp_1 .dn D29.U32
+
+
+
+
+
+ LDRH numTaps,[pStateStruct],#4
+ LDR pState,[pStateStruct],#4
+ LDR pCoeffs,[pStateStruct],#4
+
+ @/* S->state buffer contains previous frame (numTaps - 1) samples */
+ @/* pStateCurnt points to the location where the new input data should be written */
+ @/*pStateCurnt = &(S->state[(numTaps - 1u)])@*/
+ SUB mask,numTaps,#1
+ LDR pMask,=ne10_qMaskTable32
+ AND tapCnt,numTaps,#3
+ ADD pStateCurnt,pState,mask,LSL #2
+ AND mask,blockSize,#3
+
+
+ @/* Apply loop unrolling and compute 4 output values simultaneously.
+ @* The variables acc0 ... acc3 hold output values that are being computed:
+ @*
+ @* acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ @* acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ @* acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ @* acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ @*/
+
+ @/*If numTaps,blockSize are not multiples of 4, Get the appropriate Masks*/
+
+
+ ADD pTemp,pMask,tapCnt,LSL #4
+ VEOR qZero,qZero
+ ADD pX,pMask,mask,LSL #4
+ VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp]
+ VLD1 {dMask1_0,dMask1_1},[pX]
+
+
+ @/* Copy blockCnt number of new input samples into the state buffer */
+
+ SUBS blockSize,#4
+ BLT firEndOuterLoop
+
+ @/* Compute 4 outputs at a time*/
+
+firOuterLoop:
+
+ VLD1 {dTemp_0,dTemp_1},[pSrc]!
+ MOV pX,pState
+ MOV pB,pCoeffs
+ @/* Read the first four samples from the state buffer:
+ @* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2],x[n-numTaps-3] */
+
+ VST1 {dTemp_0,dTemp_1},[pStateCurnt]!
+ @/* Zero the Accumulators*/
+ VEOR qAcc0,qAcc0
+ VLD1 {dInp_0,dInp_1},[pX]!
+
+ @//* Read the first four coefficients b[numTaps] to b[numTaps-3] */
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ @/* Loop unrolling. Process 4 taps at a time. */
+ SUBS tapCnt,numTaps,#4
+ VLD1 {dTemp_0,dTemp_1},[pX]!
+
+ BLT firEndInnerLoop
+
+firInnerLoop:
+
+
+ VEXT qTemp1,qInp,qTemp,#1
+ @/* acc0 += b[numTaps] * x[n-numTaps-1]+ b[numTaps] * x[n-numTaps-2] +
+ @* b[numTaps] * x[n-numTaps-3] + b[numTaps] * x[n-numTaps-4]*/
+ VMLA qAcc0,qInp,dCoeff_0[0]
+ VEXT qTemp2,qInp,qTemp,#2
+ @/* acc1 += b[numTaps-1] * x[n-numTaps-2]+ b[numTaps-1] * x[n-numTaps-3] +
+ @b[numTaps-1] * x[n-numTaps-4] +*b[numTaps-1] * x[n-numTaps-5]*/
+ VMLA qAcc0,qTemp1,dCoeff_0[1]
+ VEXT qTemp3,qInp,qTemp,#3
+ @/* acc2 += b[numTaps-2] * x[n-numTaps-3]+ b[numTaps-2] * x[n-numTaps-4] +
+ @b[numTaps-2] * x[n-numTaps-5] + *b[numTaps-2] * x[n-numTaps-6]*/
+ @//vacc0q_f32 = vmlaq_lane_f32(vacc0q_f32,vxtemp2q_f32,vget_high_f32(vcq_f32),0)@
+ VMLA qAcc0,qTemp2,dCoeff_1[0]
+ VMOV qInp,qTemp
+ @/* acc3 += b[numTaps-3] * x[n-numTaps-4]+ b[numTaps-3] * x[n-numTaps-5] +
+ @b[numTaps-3] * x[n-numTaps-6] +*b[numTaps-3] * x[n-numTaps-7] */
+
+ VMLA qAcc0,qTemp3,dCoeff_1[1]
+
+ @/* Read the b[numTaps-4] to b[numTaps-7] coefficients */
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ SUBS tapCnt,#4
+ VLD1 {dTemp_0,dTemp_1},[pX]!
+
+ BGE firInnerLoop
+firEndInnerLoop:
+
+
+ @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ @/*Select only the remaining filter Taps*/
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qCoeff,qZero
+ VEXT qTemp1,qInp,qTemp,#1
+ VMLA qAcc0,qInp,dOut_0[0]
+ VEXT qTemp2,qInp,qTemp,#2
+ VMLA qAcc0,qTemp1,dOut_0[1]
+ VEXT qTemp3,qInp,qTemp,#3
+ VMLA qAcc0,qTemp2,dOut_1[0]
+ @/* Advance the state pointer by 4 to process the next group of 4 samples */
+ ADD pState,#16
+
+ VMLA qAcc0,qTemp3,dOut_1[1]
+
+
+ @/* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
+ @ * Then store the 4 outputs in the destination buffer. */
+ SUBS blockSize,#4
+ VST1 {dAcc0_0,dAcc0_1},[pDst]!
+
+ BGE firOuterLoop
+
+firEndOuterLoop:
+ @/*Handle BlockSize Not a Multiple of 4*/
+ ADDS blockSize,#4
+ BEQ firCopyData
+ @/*Copy the Remaining BlockSize Number of Input Sample to state Buffer*/
+ VMOV qMask,qMask1
+ VLD1 {dTemp1_0,dTemp1_1},[pStateCurnt]
+ VLD1 {dTemp_0,dTemp_1},[pSrc]
+
+ ADD pSrc,pSrc,blockSize,LSL #2
+ MOV pX,pState
+ MOV pB,pCoeffs
+
+ VBSL qMask,qTemp,qTemp1
+ VST1 {dMask_0,dMask_1},[pStateCurnt]
+ VLD1 {dInp_0,dInp_1},[pX]!
+
+ ADD pStateCurnt,pStateCurnt,blockSize, LSL #2
+
+ @/* Zero the Accumulators*/
+ VEOR qAcc0,qAcc0
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ SUBS tapCnt,numTaps,#4
+ VLD1 {dTemp_0,dTemp_1},[pX]!
+
+ BLT firEndInnerLoop1
+
+firInnerLoop1:
+
+ VEXT qTemp1,qInp,qTemp,#1
+ VMLA qAcc0,qInp,dCoeff_0[0]
+ VEXT qTemp2,qInp,qTemp,#2
+ VMLA qAcc0,qTemp1,dCoeff_0[1]
+ VEXT qTemp3,qInp,qTemp,#3
+ VMLA qAcc0,qTemp2,dCoeff_1[0]
+ VMOV qInp,qTemp
+ VMLA qAcc0,qTemp3,dCoeff_1[1]
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ SUBS tapCnt,#4
+ VLD1 {dTemp_0,dTemp_1},[pX]!
+
+ BGE firInnerLoop1
+firEndInnerLoop1:
+
+
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qCoeff,qZero
+ VEXT qTemp1,qInp,qTemp,#1
+ VMLA qAcc0,qInp,dOut_0[0]
+ VEXT qTemp2,qInp,qTemp,#2
+ VMLA qAcc0,qTemp1,dOut_0[1]
+ VEXT qTemp3,qInp,qTemp,#3
+ VMLA qAcc0,qTemp2,dOut_1[0]
+ VMOV qMask,qMask1
+ VLD1 {dTemp_0,dTemp_1},[pDst]
+ VMLA qAcc0,qTemp3,dOut_1[1]
+
+
+ @/* If the blockSize is not a multiple of 4, Mask the unwanted Output */
+
+ VBSL qMask,qAcc0,qTemp
+ VST1 {dMask_0,dMask_1},[pDst]
+ ADD pDst,pDst,blockSize,LSL #2
+ ADD pState,pState,blockSize,LSL #2
+
+
+firCopyData:
+ @/* Processing is complete. Now shift the data in the state buffer down by
+ @** blockSize samples. This prepares the state buffer for the next function
+ @** call. */
+
+ @/* Points to the start of the state buffer */
+
+ SUB numTaps,numTaps,#1
+ AND mask,numTaps,#3
+ LDR pStateCurnt,[pStateStruct,#-8]
+ ADD pTemp,pMask,mask,LSL #4
+ VLD1 {dInp_0,dInp_1},[pState]!
+ VLD1 {dMask_0,dMask_1},[pTemp]
+
+
+ @/* copy data */
+
+ SUBS Count,numTaps,#4
+ BLT firEnd
+firCopyLoop:
+ VST1 {dInp_0,dInp_1},[pStateCurnt]!
+ SUBS Count,#4
+ VLD1 {dInp_0,dInp_1},[pState]!
+ BGE firCopyLoop
+
+firEnd:
+
+ VLD1 {dTemp_0,dTemp_1},[pStateCurnt]
+ VBSL qMask,qInp,qTemp
+ VST1 {dOut_0,dOut_1},[pStateCurnt]
+ ADD pStateCurnt,pStateCurnt,mask, LSL #2
+
+ @/*Return From Function*/
+ POP {r4-r12,pc}
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq blockSize
+
+.unreq pState
+.unreq pCoeffs
+.unreq pStateCurnt
+
+.unreq pX
+.unreq pB
+.unreq numTaps
+
+.unreq tapCnt
+.unreq Count
+.unreq pTemp
+.unreq pMask
+
+.unreq mask
+
+@/*NEON variale Declaration*/
+.unreq qInp
+.unreq dInp_0
+.unreq dInp_1
+.unreq qCoeff
+.unreq dCoeff_0
+.unreq dCoeff_1
+.unreq qZero
+
+.unreq qMask
+.unreq dMask_0
+.unreq dMask_1
+.unreq dOut_0
+.unreq dOut_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qTemp1
+.unreq dTemp1_0
+.unreq dTemp1_1
+.unreq qTemp2
+.unreq qTemp3
+.unreq qMask1
+.unreq dMask1_0
+.unreq dMask1_1
+.unreq qMaskTmp
+.unreq dMaskTmp_0
+.unreq dMaskTmp_1
+
+ @/**
+ @ * @details
+ @ * This function operates on floating-point data types.
+ @ * There are no restrictions on numTaps and blockSize.
+ @ *
+ @ * The order of the coefficients in *coeffs should be
+ @ * bN, bN-1, bN-2, .....b1, b0
+ @ *
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code> Co + C1 * numTaps + C3 * blockSize * decimation Factor + c4 * numTaps * blockSize</code>
+ @ *
+ @ * @param[in] *S points to struct parameter
+ @ * @param[in] *pSrc points to the input buffer
+ @ * @param[out] *pDst points to the output buffer
+ @ * @param[in] blockSize block size of filter
+ @ */
+
+ .align 4
+ .global ne10_fir_decimate_float_neon
+ .extern ne10_qMaskTable32
+ .extern ne10_divLookUpTable
+ .thumb
+ .thumb_func
+
+ne10_fir_decimate_float_neon:
+
+ PUSH {r4-r12,lr}
+
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+blockSize .req R3
+
+pState .req R4 @/* State pointer */
+pCoeffs .req R5 @/* Coefficient pointer */
+decimationFact .req R6
+outBlockSize .req R7
+
+pX .req R6 @/* Temporary pointers for state buffer */
+pB .req R8 @/* Temporary pointers for coefficient buffer */
+numTaps .req R9 @/* Length of the filter */
+
+tapCnt .req R10 @ /* Loop counter */
+Count .req R11 @ /* Loop counter */
+pTemp .req R11
+blkCnt .req R11
+pMask .req R14 @ /* Mask Table */
+
+mask .req R12
+Offset .req R12
+
+@/*NEON variale Declaration*/
+qInp0 .qn Q0.F32
+dInp0_0 .dn D0.F32
+dInp0_1 .dn D1.F32
+
+qCoeff .qn Q1.F32
+dCoeff_0 .dn D2.F32
+dCoeff_1 .dn D3.F32
+
+qZero .qn Q2.F32
+qMask .qn Q3.U32
+qMaskF32 .qn Q3.F32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+dOut_0 .dn D6.F32
+dOut_1 .dn D7.F32
+
+qInp3 .qn Q4.F32
+dInp3_0 .dn D8.F32
+dInp3_1 .dn D9.F32
+
+qAcc0 .qn Q8.F32
+dAcc0_0 .dn D16.F32
+dAcc0_1 .dn D17.F32
+
+
+qTemp .qn Q9.F32
+dTemp_0 .dn D18.F32
+dTemp_1 .dn D19.F32
+
+qInp1 .qn Q9.F32
+dInp1_0 .dn D18.F32
+dInp1_1 .dn D19.F32
+
+qAcc1 .qn Q10.F32
+dAcc1_0 .dn D20.F32
+dAcc1_1 .dn D21.F32
+qAcc2 .qn Q11.F32
+dAcc2_0 .dn D22.F32
+dAcc2_1 .dn D23.F32
+qAcc3 .qn Q12.F32
+dAcc3_0 .dn D24.F32
+dAcc3_1 .dn D25.F32
+
+qMask1 .qn Q13.U32
+dMask1_0 .dn D26.U32
+dMask1_1 .dn D27.U32
+
+qMaskTmp .qn Q14.U32
+dMaskTmp_0 .dn D28.U32
+dMaskTmp_1 .dn D29.U32
+
+
+qInp2 .qn Q15.F32
+dInp2_0 .dn D30.F32
+dInp2_1 .dn D31.F32
+
+
+
+
+ LDRB decimationFact,[pStateStruct],#2
+ LDRH numTaps,[pStateStruct],#2
+ LDR pCoeffs,[pStateStruct],#4
+ LDR pState,[pStateStruct],#4
+
+ @//outBlockSize = blockSize / S->M
+ LDR pTemp,=ne10_divLookUpTable
+ SUBS mask,decimationFact,#1
+ ADD pTemp,pTemp,mask,LSL #2
+ LDR mask,[pTemp]
+ @//MOV pX,#0
+
+
+ SMULWB outBlockSize,blockSize,mask
+ CMP outBlockSize,#0
+ IT LT
+ RSBLT outBlockSize,#0
+
+
+ @/* S->state buffer contains previous frame (numTaps - 1) samples */
+ @/* pStateCurnt points to the location where the new input data should be written */
+ @//pStateCurnt = S->state + (numTaps - 1u)@
+
+
+ @/* Copy Blocksize number of new input samples into the state buffer */
+
+ LDR pMask,=ne10_qMaskTable32
+ SUB tapCnt,numTaps,#1
+ AND mask,blockSize,#3
+
+ ADD pB,pState,tapCnt,LSL #2
+ ADD mask,pMask,mask,LSL #4
+ VLD1 {dTemp_0,dTemp_1},[pSrc]!
+ VLD1 {dMask1_0,dMask1_1},[mask]
+
+
+ SUBS Count,blockSize,#4
+ LSL Offset,decimationFact, #2
+ VMOV qMask,qMask1
+
+ BLT firDecimateEndCopy
+firDecimateCopyLoop:
+
+ VST1 {dTemp_0,dTemp_1},[pB]!
+ SUBS Count,#4
+ VLD1 {dTemp_0,dTemp_1},[pSrc]!
+ BGE firDecimateCopyLoop
+firDecimateEndCopy:
+ VLD1 {dCoeff_0,dCoeff_1},[pB]
+
+ VBSL qMask,qTemp,qCoeff
+ VST1 {dMask_0,dMask_1},[pB]
+ ADD pB,pB,tapCnt,LSL #2
+
+ @// Load Mask Value
+ AND blkCnt,outBlockSize,#3
+ ADD blkCnt,pMask,blkCnt,LSL #4
+ VLD1 {dMask1_0,dMask1_1},[blkCnt]
+
+ @/*Load Mask Table Values*/
+
+ AND tapCnt,numTaps,#3
+ ADD pTemp,pMask,tapCnt,LSL #4
+ VEOR qZero,qZero,qZero
+ VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp]
+
+ @/*Handle 4 output samples at a time */
+ SUBS blkCnt,outBlockSize,#4
+ BLT firDecimateEndOuterLoop
+
+ @//blkCnt = outBlockSize>>2@
+firDecimateOuterLoop:
+ @/* Set accumulator to zero */
+ VEOR qAcc0,qAcc0,qAcc0
+ VEOR qAcc1,qAcc1,qAcc1
+ VEOR qAcc2,qAcc2,qAcc2
+ VEOR qAcc3,qAcc3,qAcc3
+ @/* Initialize state pointer */
+ MOV pX,pState
+ @/* Initialize coeff pointer */
+ MOV pB,pCoeffs
+
+ SUBS tapCnt,numTaps,#4
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+
+ VLD1 {dInp0_0,dInp0_1},[pX],Offset
+ VLD1 {dInp1_0,dInp1_1},[pX],Offset
+ VLD1 {dInp2_0,dInp2_1},[pX],Offset
+ VLD1 {dInp3_0,dInp3_1},[pX],Offset
+ SUB pX,pX,Offset, LSL #2
+ ADD pX,pX,#16
+
+ BLT firDecimateEndInnerLoop
+firDecimateInnerLoop:
+ VMLA qAcc0,qCoeff,qInp0
+ VMLA qAcc1,qCoeff,qInp1
+ VMLA qAcc2,qCoeff,qInp2
+ VMLA qAcc3,qCoeff,qInp3
+
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ VLD1 {dInp0_0,dInp0_1},[pX],Offset
+ VLD1 {dInp1_0,dInp1_1},[pX],Offset
+ VLD1 {dInp2_0,dInp2_1},[pX],Offset
+ VLD1 {dInp3_0,dInp3_1},[pX],Offset
+ SUB pX,pX,Offset, LSL #2
+ ADD pX,pX,#16
+
+ SUBS tapCnt,#4
+ BGE firDecimateInnerLoop
+firDecimateEndInnerLoop:
+ @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qCoeff,qZero
+
+ VMLA qAcc0,qMaskF32,qInp0
+ VMLA qAcc1,qMaskF32,qInp1
+ VMLA qAcc2,qMaskF32,qInp2
+ VMLA qAcc3,qMaskF32,qInp3
+
+ VADD dAcc0_0,dAcc0_0,dAcc0_1
+ VADD dAcc1_0,dAcc1_0,dAcc1_1
+ VADD dAcc2_0,dAcc2_0,dAcc2_1
+ VADD dAcc3_0,dAcc3_0,dAcc3_1
+
+ VPADD dAcc0_0,dAcc0_0,dAcc1_0
+ VPADD dAcc0_1,dAcc2_0,dAcc3_0
+ ADD pState,pState,Offset,LSL #2
+ VST1 {dAcc0_0,dAcc0_1},[pDst]!
+
+ SUBS blkCnt,#4
+ BGE firDecimateOuterLoop
+
+firDecimateEndOuterLoop:
+ @/*Handle BlockSize Not a Multiple of 4*/
+ ADDS blkCnt,#4
+ BEQ firDecimateCopyData
+
+
+ @/* Set accumulator to zero */
+ VEOR qAcc0,qAcc0,qAcc0
+ VEOR qAcc1,qAcc1,qAcc1
+ VEOR qAcc2,qAcc2,qAcc2
+ VEOR qAcc3,qAcc3,qAcc3
+ @/* Initialize state pointer */
+ MOV pX,pState
+ @/* Initialize coeff pointer */
+ MOV pB,pCoeffs
+ SUBS tapCnt,numTaps,#4
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+
+ VLD1 {dInp0_0,dInp0_1},[pX],Offset
+ VLD1 {dInp1_0,dInp1_1},[pX],Offset
+ VLD1 {dInp2_0,dInp2_1},[pX],Offset
+ VLD1 {dInp3_0,dInp3_1},[pX],Offset
+ SUB pX,pX,Offset, LSL #2
+ ADD pX,pX,#16
+
+ BLT firDecimateEndInnerLoop1
+firDecimateInnerLoop1:
+ VMLA qAcc0,qCoeff,qInp0
+ VMLA qAcc1,qCoeff,qInp1
+ VMLA qAcc2,qCoeff,qInp2
+ VMLA qAcc3,qCoeff,qInp3
+
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+ VLD1 {dInp0_0,dInp0_1},[pX],Offset
+ VLD1 {dInp1_0,dInp1_1},[pX],Offset
+ VLD1 {dInp2_0,dInp2_1},[pX],Offset
+ VLD1 {dInp3_0,dInp3_1},[pX],Offset
+ SUB pX,pX,Offset, LSL #2
+ ADD pX,pX,#16
+
+ SUBS tapCnt,#4
+ BGE firDecimateInnerLoop1
+firDecimateEndInnerLoop1:
+ @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qCoeff,qZero
+
+ VMLA qAcc0,qMaskF32,qInp0
+ VMLA qAcc1,qMaskF32,qInp1
+ VMLA qAcc2,qMaskF32,qInp2
+ VMLA qAcc3,qMaskF32,qInp3
+
+ VADD dAcc0_0,dAcc0_0,dAcc0_1
+ VADD dAcc1_0,dAcc1_0,dAcc1_1
+ VADD dAcc2_0,dAcc2_0,dAcc2_1
+ VADD dAcc3_0,dAcc3_0,dAcc3_1
+
+
+ MUL Offset,Offset,blkCnt
+ VPADD dAcc0_0,dAcc0_0,dAcc1_0
+ VPADD dAcc0_1,dAcc2_0,dAcc3_0
+ ADD pState,pState,Offset
+
+ VMOV qMask,qMask1
+ VLD1 {dTemp_0,dTemp_1},[pDst]
+ VBSL qMask,qAcc0,qTemp
+
+ VST1 {dMask_0,dMask_1},[pDst]
+ ADD pDst,pDst,blkCnt,LSL #2
+
+
+firDecimateCopyData:
+ @/* Processing is complete. Now shift the data in the state buffer down by
+ @** blockSize samples. This prepares the state buffer for the next function
+ @** call. */
+
+ @/* Points to the start of the state buffer */
+
+ SUB numTaps,numTaps,#1
+ AND mask,numTaps,#3
+ LDR pX,[pStateStruct,#-4]
+ ADD pTemp,pMask,mask,LSL #4
+ VLD1 {dInp0_0,dInp0_1},[pState]!
+ VLD1 {dMask_0,dMask_1},[pTemp]
+
+ @/* copy data */
+
+ SUBS Count,numTaps,#4
+ BLT firDecimateEnd
+firDecimateCopyLoop1:
+ VST1 {dInp0_0,dInp0_1},[pX]!
+ SUBS Count,#4
+ VLD1 {dInp0_0,dInp0_1},[pState]!
+ BGE firDecimateCopyLoop1
+firDecimateEnd:
+ VLD1 {dTemp_0,dTemp_1},[pX]
+ VBSL qMask,qInp0,qTemp
+ VST1 {dOut_0,dOut_1},[pX]
+ ADD pX,pX,mask, LSL #2
+
+ @// Return From Function
+ POP {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq blockSize
+
+.unreq pState
+.unreq pCoeffs
+.unreq decimationFact
+.unreq outBlockSize
+
+.unreq pX
+.unreq pB
+.unreq numTaps
+
+.unreq tapCnt
+.unreq Count
+.unreq pTemp
+.unreq blkCnt
+.unreq pMask
+
+.unreq mask
+.unreq Offset
+
+@/*NEON variale Declaration*/
+.unreq qInp0
+.unreq dInp0_0
+.unreq dInp0_1
+
+.unreq qCoeff
+.unreq dCoeff_0
+.unreq dCoeff_1
+
+.unreq qZero
+.unreq qMask
+.unreq qMaskF32
+.unreq dMask_0
+.unreq dMask_1
+.unreq dOut_0
+.unreq dOut_1
+
+.unreq qInp3
+.unreq dInp3_0
+.unreq dInp3_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qInp1
+.unreq dInp1_0
+.unreq dInp1_1
+
+.unreq qAcc1
+.unreq dAcc1_0
+.unreq dAcc1_1
+.unreq qAcc2
+.unreq dAcc2_0
+.unreq dAcc2_1
+.unreq qAcc3
+.unreq dAcc3_0
+.unreq dAcc3_1
+
+.unreq qMask1
+.unreq dMask1_0
+.unreq dMask1_1
+
+.unreq qMaskTmp
+.unreq dMaskTmp_0
+.unreq dMaskTmp_1
+
+.unreq qInp2
+.unreq dInp2_0
+.unreq dInp2_1
+
+
+ @/**
+ @ * @details
+ @ * This function operates on floating-point data types.
+ @ * There are no restrictions on numTaps and blockSize.
+ @ *
+ @ * The order of the coefficients in *coeffs should be
+ @ * bN, bN-1, bN-2, .....b1, b0
+ @ *
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code> C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor </code>
+ @ *
+ @ * @param[in] *S points to struct parameter
+ @ * @param[in] *pSrc points to the input buffer
+ @ * @param[out] *pDst points to the output buffer
+ @ * @param[in] blockSize block size of filter
+ @ */
+
+ .align 4
+ .global ne10_fir_interpolate_float_neon
+ .extern ne10_qMaskTable32
+ .thumb
+ .thumb_func
+
+ne10_fir_interpolate_float_neon:
+ PUSH {r4-r12,lr}
+
+
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+blockSize .req R3
+
+pState .req R4 @/* State pointer */
+
+pB .req R5 @/* Temporary pointers for coefficient buffer */
+pCoeffs .req R5 @/* Coefficient pointer */
+pStateCurnt .req R5 @/* Points to the current sample of the state */
+
+pX .req R6 @/* Temporary pointers for state buffer */
+
+interpolationFact .req R7
+intFact .req R8
+phaseLen .req R9
+Offset .req R10
+
+Count .req R11 @ /* Loop counter */
+pTemp .req R11
+
+mask .req R12
+
+pMask .req R14 @ /* Mask Table */
+index .req R14
+
+@/*NEON variale Declaration*/
+qInp .qn Q0.F32
+dInp_0 .dn D0.F32
+dInp_1 .dn D1.F32
+qCoeff0 .qn Q1.F32
+dCoeff0_0 .dn D2.F32
+dCoeff0_1 .dn D3.F32
+qZero .qn Q2.F32
+
+qMask .qn Q3.U32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+dOut_0 .dn D6.F32
+dOut_1 .dn D7.F32
+
+qAcc0 .qn Q8.F32
+dAcc0_0 .dn D16.F32
+dAcc0_1 .dn D17.F32
+
+
+qTemp .qn Q9.F32
+dTemp_0 .dn D18.F32
+dTemp_1 .dn D19.F32
+
+qCoeff1 .qn Q10.F32
+dCoeff1_0 .dn D20.F32
+dCoeff1_1 .dn D21.F32
+qCoeff2 .qn Q11.F32
+dCoeff2_0 .dn D22.F32
+dCoeff2_1 .dn D23.F32
+qCoeff3 .qn Q12.F32
+dCoeff3_0 .dn D24.F32
+dCoeff3_1 .dn D25.F32
+
+qMask1 .qn Q13.F32
+dMask1_0 .dn D26.F32
+dMask1_1 .dn D27.F32
+
+
+qMaskTemp .qn Q14.U32
+dMaskTemp_0 .dn D28.U32
+dMaskTemp_1 .dn D29.U32
+
+ LDRB interpolationFact,[pStateStruct],#2
+ LDRH phaseLen,[pStateStruct],#2
+ LDR pCoeffs,[pStateStruct],#4
+ LDR pState,[pStateStruct],#4
+
+ LSL Offset,interpolationFact, #2
+
+
+
+ @/* S->state buffer contains previous frame (phaseLen - 1) samples */
+ @/* pStateCurnt points to the location where the new input data should be written */
+
+
+ AND phaseLen,#3
+ LDR pMask,=ne10_qMaskTable32
+
+ @/* Total number of intput samples */
+ @/*Load Mask Value*/
+
+ AND mask,interpolationFact,#3
+ ADD pTemp,pMask,phaseLen,LSL #4
+ ADD mask,pMask,mask,LSL #4
+
+ VLD1 {dMaskTemp_0,dMaskTemp_1},[pTemp]
+ VLD1 {dMask1_0,dMask1_1},[mask]
+
+
+ VEOR qZero,qZero,qZero
+
+
+
+ @/* Loop over the blockSize. */
+ CMP blockSize,#0
+ BEQ firInterpolateCopyData
+firInterpolateBlockLoop:
+ @/* Copy new input sample into the state buffer */
+ LDRH phaseLen,[pStateStruct,#-10]
+ LDR mask,[pSrc],#4
+ SUB phaseLen,#1
+ ADD pStateCurnt,pState,phaseLen, LSL #2
+
+ LDRB interpolationFact,[pStateStruct,#-12]
+ STR mask,[pStateCurnt]
+
+
+ SUBS intFact,interpolationFact,#4
+ MOV index,#4
+
+ BLT firInterpolateEndIntplLoop
+firInterpolateInterpolLoop:
+ VEOR qAcc0,qAcc0,qAcc0
+ LDRH phaseLen,[pStateStruct,#-10]
+ LDR pCoeffs,[pStateStruct,#-8]
+ MOV pX,pState
+ SUB mask,interpolationFact,index
+ ADD pB,pCoeffs,mask, LSL #2
+ @/*Load Coefficients*/
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset
+ VLD1 {dInp_0,dInp_1},[pX]!
+ @/* Loop over the polyPhase length. Unroll by a factor of 4.
+ @ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ SUBS phaseLen,#4
+ BLT firInterpolateEndPhaseLoop
+firInterpolatePhaseLoop:
+ @/* Perform the multiply-accumulate */
+ VMLA qAcc0,qCoeff0,dInp_0[0]
+ VMLA qAcc0,qCoeff1,dInp_0[1]
+ VMLA qAcc0,qCoeff2,dInp_1[0]
+ VMLA qAcc0,qCoeff3,dInp_1[1]
+
+ VLD1 {dInp_0,dInp_1},[pX]!
+ @ /*Load Coefficients*/
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset
+
+ SUBS phaseLen,#4
+ BGE firInterpolatePhaseLoop
+firInterpolateEndPhaseLoop:
+ @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ VMOV qMask,qMaskTemp
+ VBSL qMask,qInp,qZero
+ @/* Perform the multiply-accumulate */
+ VMLA qAcc0,qCoeff0,dOut_0[0]
+ VMLA qAcc0,qCoeff1,dOut_0[1]
+ VMLA qAcc0,qCoeff2,dOut_1[0]
+ VMLA qAcc0,qCoeff3,dOut_1[1]
+ @ /* The result is in the accumulator is in Reverse Order*/
+ VREV64 qAcc0,qAcc0
+ @/*Swap the D-Regs of Acc*/
+ VMOV dCoeff0_0,dAcc0_1
+ VMOV dCoeff0_1,dAcc0_0
+
+ VST1 {dCoeff0_0,dCoeff0_1},[pDst]!
+ ADD index,#4
+ SUBS intFact,#4
+ BGE firInterpolateInterpolLoop
+
+firInterpolateEndIntplLoop:
+ ADDS intFact,#4
+ BEQ firInterpolateNextSample
+ @/*Handle the Remaining Samples*/
+ VEOR qAcc0,qAcc0,qAcc0
+ LDRH phaseLen,[pStateStruct,#-10]
+ LDR pCoeffs,[pStateStruct,#-8]
+ MOV pX,pState
+ SUB mask,interpolationFact,index
+ ADD pB,pCoeffs,mask, LSL #2
+ @/*Load Coefficients*/
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset
+ VLD1 {dInp_0,dInp_1},[pX]!
+ @/* Loop over the polyPhase length. Unroll by a factor of 4.
+ @ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ SUBS phaseLen,#4
+ BLT firInterpolateEndPhaseLoop1
+firInterpolatePhaseLoop1:
+ @/* Perform the multiply-accumulate */
+ VMLA qAcc0,qCoeff0,dInp_0[0]
+ VMLA qAcc0,qCoeff1,dInp_0[1]
+ VMLA qAcc0,qCoeff2,dInp_1[0]
+ VMLA qAcc0,qCoeff3,dInp_1[1]
+
+ VLD1 {dInp_0,dInp_1},[pX]!
+ @ /*Load Coefficients*/
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff0_0,dCoeff0_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff1_0,dCoeff1_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff2_0,dCoeff2_1},[pB],Offset
+ @/*c0 c1 c2 c3*/
+ VLD1 {dCoeff3_0,dCoeff3_1},[pB],Offset
+ SUBS phaseLen,#4
+ BGE firInterpolatePhaseLoop1
+
+firInterpolateEndPhaseLoop1:
+ @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ VMOV qMask,qMaskTemp
+ VBSL qMask,qInp,qZero
+ @/* Perform the multiply-accumulate */
+ VMLA qAcc0,qCoeff0,dOut_0[0]
+ VMLA qAcc0,qCoeff1,dOut_0[1]
+ VMLA qAcc0,qCoeff2,dOut_1[0]
+ VMLA qAcc0,qCoeff3,dOut_1[1]
+ @ /* The result is in the accumulator is in Reverse Order*/
+ VREV64 qAcc0,qAcc0
+
+ VMOV qMask,qMask1
+ VLD1 {dTemp_0,dTemp_1},[pDst]
+ @/*Swap the D-Regs of Acc*/
+ VMOV dCoeff0_0,dAcc0_1
+ VMOV dCoeff0_1,dAcc0_0
+
+ VBSL qMask,qCoeff0,qTemp
+ VST1 {dMask_0,dMask_1},[pDst]
+ ADD pDst,pDst,intFact, LSL #2
+
+
+firInterpolateNextSample:
+ SUBS blockSize,#1
+ ADD pState,#4
+ BGT firInterpolateBlockLoop
+
+ @/*End of Processing*/
+
+firInterpolateCopyData:
+
+ @/* Save previous phaseLen - 1 samples and get rid of other samples */
+ @/* Points to the start of the state buffer */
+ LDRH phaseLen,[pStateStruct,#-10]
+ LDR pMask,=ne10_qMaskTable32
+ LDR pStateCurnt,[pStateStruct,#-4]
+
+ SUB phaseLen,phaseLen,#1
+ AND mask,phaseLen,#3
+ ADD pTemp,pMask,mask,LSL #4
+
+ VLD1 {dInp_0,dInp_1},[pState]!
+ VLD1 {dMask_0,dMask_1},[pTemp]
+
+ @/* copy data */
+
+ SUBS Count,phaseLen,#4
+ BLT firInterpolateEnd
+firInterpolateCopyLoop:
+ VST1 {dInp_0,dInp_1},[pStateCurnt]!
+ SUBS Count,#4
+ VLD1 {dInp_0,dInp_1},[pState]!
+ BGE firInterpolateCopyLoop
+firInterpolateEnd:
+
+ VLD1 {dTemp_0,dTemp_1},[pStateCurnt]
+ VBSL qMask,qInp,qTemp
+ VST1 {dOut_0,dOut_1},[pStateCurnt]
+
+ ADD pStateCurnt,pStateCurnt,mask, LSL #2
+
+ @/*Return From Function*/
+ POP {r4-r12,pc}
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq blockSize
+
+.unreq pState
+
+.unreq pB
+.unreq pCoeffs
+.unreq pStateCurnt
+
+.unreq pX
+
+.unreq interpolationFact
+.unreq intFact
+.unreq phaseLen
+.unreq Offset
+
+.unreq Count
+.unreq pTemp
+
+.unreq mask
+
+.unreq pMask
+.unreq index
+
+@/*NEON variale Declaration*/
+.unreq qInp
+.unreq dInp_0
+.unreq dInp_1
+.unreq qCoeff0
+.unreq dCoeff0_0
+.unreq dCoeff0_1
+.unreq qZero
+
+.unreq qMask
+.unreq dMask_0
+.unreq dMask_1
+.unreq dOut_0
+.unreq dOut_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qCoeff1
+.unreq dCoeff1_0
+.unreq dCoeff1_1
+.unreq qCoeff2
+.unreq dCoeff2_0
+.unreq dCoeff2_1
+.unreq qCoeff3
+.unreq dCoeff3_0
+.unreq dCoeff3_1
+
+.unreq qMask1
+.unreq dMask1_0
+.unreq dMask1_1
+
+.unreq qMaskTemp
+.unreq dMaskTemp_0
+.unreq dMaskTemp_1
+
+
+ @/**
+ @ * @details
+ @ * This function operates on floating-point data types.
+ @ * There are no restrictions on numStages and blockSize.
+ @ *
+ @ * The order of the coefficients in *coeffs should be
+ @ * k1, k2, ...kM-1
+ @ *
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code>c0 + c1 * blockSize + c2 * numStages * blockSize</code>
+ @ *
+ @ * @param[in] *S points to struct parameter
+ @ * @param[in] *pSrc points to the input buffer
+ @ * @param[out] *pDst points to the output buffer
+ @ * @param[in] blockSize block size of filter
+ @ */
+
+ .align 4
+ .global ne10_fir_lattice_float_neon
+ .extern ne10_qMaskTable32
+ .thumb
+ .thumb_func
+
+ne10_fir_lattice_float_neon:
+
+ PUSH {r4-r12,lr}
+
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+blockSize .req R3
+
+pState .req R4 @/* State pointer */
+pCoeffs .req R5 @/* Coefficient pointer */
+
+pX .req R7 @/* Temporary pointers for state buffer */
+pB .req R8 @/* Temporary pointers for coefficient buffer */
+numStages .req R9 @/* Length of the filter */
+
+stageCnt .req R10 @ /* Loop counter */
+
+
+pTemp .req R11
+pMask .req R14 @ /* Mask Table */
+mask .req R12
+
+@/*NEON variale Declaration*/
+qFcurr .qn Q0.F32
+dFcurr_0 .dn D0.F32
+dFcurr_1 .dn D1.F32
+qCoeff .qn Q1.F32
+dCoeff_0 .dn D2.F32
+dCoeff_1 .dn D3.F32
+
+qZero .qn Q2.F32
+
+qMask .qn Q3.U32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+dOut_0 .dn D6.F32
+dOut_1 .dn D7.F32
+
+qAcc0 .qn Q8.F32
+dAcc0_0 .dn D16.F32
+dAcc0_1 .dn D17.F32
+
+qTemp .qn Q9.F32
+dTemp_0 .dn D18.F32
+dTemp_1 .dn D19.F32
+
+qFnext .qn Q10.F32
+dFnext_0 .dn D20.F32
+dFnext_1 .dn D21.F32
+qGcurr .qn Q11.F32
+dGcurr_0 .dn D22.F32
+dGcurr_1 .dn D23.F32
+qGnext .qn Q12.F32
+dGnext_0 .dn D24.F32
+dGnext_1 .dn D25.F32
+
+qMask1 .qn Q13.U32
+dMask1_0 .dn D26.U32
+dMask1_1 .dn D27.U32
+qMaskTmp .qn Q14.U32
+dMaskTmp_0 .dn D28.U32
+dMaskTmp_1 .dn D29.U32
+qTemp1 .qn Q15.F32
+dTemp1_0 .dn D30.F32
+dTemp1_1 .dn D31.F32
+
+fNext .dn D0.F32
+gCurr .dn D1.F32
+gNext .dn D2.F32
+fCurr .dn D3.F32
+Coeff .dn D4.F32
+
+ @/* Length of the filter */
+ LDRH numStages,[pStateStruct],#4
+ @/* State pointer */
+ LDR pState,[pStateStruct],#4
+ @/* Coefficient pointer */
+ LDR pCoeffs,[pStateStruct],#4
+
+
+ @// Get the Mask Values
+
+ LDR pMask,=ne10_qMaskTable32
+ SUB numStages,#1
+ AND mask,numStages, #3
+ AND stageCnt,blockSize,#3
+
+ ADD pTemp,pMask,mask,LSL #4
+ ADD stageCnt,pMask,stageCnt,LSL #4
+ VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp]
+ VLD1 {dMask1_0,dMask1_1},[stageCnt]
+ VEOR qZero,qZero,qZero
+
+
+ SUBS blockSize,#4
+ BLT firLatticeEndOuterLoop
+firLatticeOuterLoop:
+ @/* Initialize coeff pointer */
+ MOV pB,pCoeffs
+ @/* Initialize state pointer */
+ MOV pX,pState
+ @/* Read Four samples from input buffer: fcurr0, fcurr1,fcurr2,fcurr3*/
+ @/* f0(n) = x(n) */
+ VLD1 {dFcurr_0,dFcurr_1},[pSrc]!
+ @/*Read one Sample from the State Buffer*/
+ VLD1 {dGcurr_1[1]},[pX]
+ VEXT qGnext,qGcurr,qFcurr,#3
+
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pB]!
+ VMOV qFnext,qFcurr
+ VST1 {dFcurr_1[1]},[pX]!
+ @/* fi(n) = fi-1(n) + Ki * gi-1(n-1) */
+ @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */
+ @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/
+
+ VMLA qFcurr,qGnext,qCoeff
+ @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/
+ VMLA qGnext,qFnext,qCoeff
+
+
+ @/* Loop unrolling. Process 4 taps at a time . */
+ SUBS stageCnt,numStages,#4
+ BLT firLatticeEndInnerLoop
+ @/* Loop over the number of taps. Unroll by a factor of 4.
+ @ * Repeat until we've computed numStages-3 coefficients. */
+ @/* Process 2nd, 3rd, 4th and 5th taps ... here */
+firLatticeInnerLoop:
+ VLD1 {dGcurr_1[1]},[pX]!
+ VREV64 dTemp_0,dGnext_1
+
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pB]!
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ @ /* fi(n) = fi-1(n) + Ki * gi-1(n-1) */
+ @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */
+ @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+ @/*Prepare for Next Stage*/
+ VLD1 {dGcurr_1[1]},[pX]!
+
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pB]!
+ VEXT dTemp_0,dGnext_1,dTemp_0,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ @/*Next Stage*/
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+ @/*Prepare for Next Stage*/
+ VLD1 {dGcurr_1[1]},[pX]!
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pB]!
+ VEXT dTemp_1,dGnext_1,dTemp_1,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+
+ @/*Next Stage*/
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+
+ @/*Prepare for Next Stage*/
+ VLD1 {dGcurr_1[1]},[pX]!
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pB]!
+ VEXT dTemp_1,dGnext_1,dTemp_1,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ VREV64 qTemp,qTemp
+ @/*Next Stage*/
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qFcurr,qGcurr,qCoeff
+ VMLA qGnext,qFnext,qCoeff
+ SUB pX,#16
+
+ @/*Store the samples in the state buffer for next frame*/
+ VST1 {dTemp_0,dTemp_1},[pX]!
+ SUBS stageCnt,#4
+ BGE firLatticeInnerLoop
+firLatticeEndInnerLoop:
+ ADDS stageCnt,#4
+ BEQ firLatticeFinishInner
+ VMOV qMask,qMaskTmp
+ VLD1 {dCoeff_0,dCoeff_1},[pB]!
+
+ VLD1 {dGcurr_1[1]},[pX]!
+ VREV64 dTemp_0,dGnext_1
+ VBSL qMask,qCoeff,qZero
+
+
+ VEXT qGcurr,qGcurr,qGnext,#3
+ VDUP qCoeff,dMask_0[0]
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+ VLD1 {dGcurr_1[1]},[pX]!
+
+ VDUP qCoeff,dMask_0[1]
+ VEXT dTemp_0,dGnext_1,dTemp_0,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+ VLD1 {dGcurr_1[1]},[pX]!
+ VDUP qCoeff,dMask_1[0]
+ VEXT dTemp_1,dGnext_1,dTemp_1,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff
+ VMLA qFcurr,qGcurr,qCoeff
+
+ VLD1 {dGcurr_1[1]},[pX]!
+ VDUP qCoeff,dMask_1[1]
+ VEXT dTemp_1,dGnext_1,dTemp_1,#1
+ VEXT qGcurr,qGcurr,qGnext,#3
+
+ VREV64 qTemp,qTemp
+
+ VMOV qFnext,qFcurr
+ VMOV qGnext,qGcurr
+ SUB pX,pX,#16
+
+ VMOV qMask,qMaskTmp
+ VMLA qFcurr,qGcurr,qCoeff
+ VLD1 {dTemp1_0,dTemp1_1},[pX]
+ VMLA qGnext,qFnext,qCoeff
+ VBSL qMask,qTemp,qTemp1
+ VST1 {dMask_0,dMask_1},[pX]
+ ADD pX,pX,stageCnt, LSL #2
+
+firLatticeFinishInner:
+
+ VST1 {dFcurr_0,dFcurr_1},[pDst]!
+ SUBS blockSize,#4
+ BGE firLatticeOuterLoop
+
+firLatticeEndOuterLoop:
+ ADDS blockSize,#4
+ BEQ firLatticeEnd
+
+firLatticeOuterLoop1:
+ VLD1 {fCurr[0]},[pSrc]!
+ MOV pB,pCoeffs
+ MOV pX,pState
+ VLD1 {gCurr[0]},[pX]
+ VLD1 {Coeff[0]},[pB]!
+
+ VST1 {fCurr[0]},[pX]!
+ VMOV gNext,gCurr
+ VMLA gNext,Coeff,fCurr
+ VMLA fCurr,Coeff,gCurr
+
+ SUBS stageCnt,numStages,#1
+ BLE firLatticeEndinnerLoop1
+firLatticeInnerLoop1:
+ VLD1 {gCurr[0]},[pX]
+ VST1 {gNext[0]},[pX]!
+
+ VLD1 {Coeff[0]},[pB]!
+
+ VMOV gNext,gCurr
+ VMLA gNext,Coeff,fCurr
+ VMLA fCurr,Coeff,gCurr
+ SUBS stageCnt,#1
+ BGE firLatticeInnerLoop1
+firLatticeEndinnerLoop1:
+ VST1 {fCurr[0]},[pDst]!
+ SUBS blockSize,#1
+ BGT firLatticeOuterLoop1
+
+firLatticeEnd:
+ @/*Return From Function*/
+ POP {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq blockSize
+
+.unreq pState
+.unreq pCoeffs
+
+.unreq pX
+.unreq pB
+.unreq numStages
+
+.unreq stageCnt
+
+.unreq pTemp
+.unreq pMask
+.unreq mask
+
+.unreq fNext
+.unreq gCurr
+.unreq gNext
+.unreq fCurr
+.unreq Coeff
+
+@/*NEON variale Declaration*/
+.unreq qFcurr
+.unreq dFcurr_0
+.unreq dFcurr_1
+.unreq qCoeff
+.unreq dCoeff_0
+.unreq dCoeff_1
+
+.unreq qZero
+
+.unreq qMask
+.unreq dMask_0
+.unreq dMask_1
+.unreq dOut_0
+.unreq dOut_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qFnext
+.unreq dFnext_0
+.unreq dFnext_1
+.unreq qGcurr
+.unreq dGcurr_0
+.unreq dGcurr_1
+.unreq qGnext
+.unreq dGnext_0
+.unreq dGnext_1
+
+.unreq qMask1
+.unreq dMask1_0
+.unreq dMask1_1
+.unreq qMaskTmp
+.unreq dMaskTmp_0
+.unreq dMaskTmp_1
+.unreq qTemp1
+.unreq dTemp1_0
+.unreq dTemp1_1
+
+ @/**
+ @ * @details
+ @ * This function operates on floating-point data types.
+ @ * There are no restrictions on numTaps and blockSize.
+ @ *
+ @ * The scratch buffer, pScratch is internally used for holding the state values temporarily.
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code> C0 * blockSize + C1 * numTaps + C2 * numTaps * blockSize</code>
+ @ *
+ @ * <b>Cycle Count:</b>
+ @ *
+ @ * <code> C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor </code>
+ @ *
+ @ * @param[in] *S points to struct parameter
+ @ * @param[in] *pSrc points to the input buffer
+ @ * @param[out] *pDst points to the output buffer
+ @ * @param[out] *pScratch points to the scratch buffer
+ @ * @param[in] blockSize block size of filter
+ @ */
+
+ .align 4
+ .global ne10_fir_sparse_float_neon
+ .extern ne10_qMaskTable32
+ .thumb
+ .thumb_func
+
+ne10_fir_sparse_float_neon:
+ PUSH {r4-r12,lr}
+ PUSH {r0}
+
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+pScratch .req R3
+blockSize .req R4
+size2 .req R4
+
+pYtmp1 .req R0
+pOut .req R0
+Offset .req R0
+
+readIndex .req R1
+
+numTaps .req R5 @/* Length of the filter */
+
+pState .req R6 @/* State pointer */
+pCoeffs .req R7 @/* Coefficient pointer */
+stateIndex .req R8
+
+maxDelay .req R9
+delaySize .req R9
+
+pTapDelay .req R10
+
+blkCnt .req R11
+size1 .req R11
+temp .req R1
+mask .req R11
+pMask .req R11
+
+pX .req R12
+
+pY .req R14
+pYtmp2 .req R14
+
+
+@/*NEON variale Declaration*/
+qInp .qn Q0.F32
+dInp_0 .dn D0.F32
+dInp_1 .dn D1.F32
+
+qCoeff .qn Q1.F32
+dCoeff_0 .dn D2.F32
+dCoeff_1 .dn D3.F32
+
+qZero .qn Q2.F32
+
+qMask .qn Q3.U32
+qMaskF32 .qn Q3.F32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+
+
+qAcc0 .qn Q4.F32
+dAcc0_0 .dn D8.F32
+dAcc0_1 .dn D9.F32
+
+
+qTemp .qn Q8.F32
+dTemp_0 .dn D16.F32
+dTemp_1 .dn D17.F32
+
+
+qMaskTmp .qn Q9.U32
+dMaskTmp_0 .dn D18.U32
+dMaskTmp_1 .dn D19.U32
+
+
+ /*Load Mask Table*/
+
+ LDRH numTaps,[pStateStruct],#2
+ LDRH stateIndex,[pStateStruct],#2
+ LDR pState,[pStateStruct],#4
+ LDR pCoeffs,[pStateStruct],#4
+ LDRH maxDelay,[pStateStruct],#4
+ LDR pTapDelay,[pStateStruct],#4
+
+ @// Load blockSize from Stack
+ LDR blockSize,[SP,#44]
+ LDR pMask,=ne10_qMaskTable32
+ ADD delaySize,blockSize,maxDelay
+
+ VEOR qZero,qZero
+ AND pY,blockSize,#3
+ ADD pY,pMask,pY,LSL #4
+ VLD1 {dMaskTmp_0,dMaskTmp_1},[pY]
+
+
+ @/* BlockSize of Input samples are copied into the state buffer */
+ @/* StateIndex points to the starting position to write in the state buffer */
+ MOV pX,pState
+ LSL Offset,stateIndex,#2
+
+ SUBS blkCnt,blockSize,#1
+ BLT firSparseEndSrcCopy
+firSparseSrcCopyLoop:
+
+ LDR pY,[pSrc],#4
+ STR pY,[pX,Offset]
+ ADD Offset,#4
+ CMP delaySize,Offset,LSR #2
+ IT LE
+ SUBLE Offset,Offset,delaySize, LSL #2
+ SUBS blkCnt,#1
+ BGE firSparseSrcCopyLoop
+firSparseEndSrcCopy:
+
+ LSR stateIndex,Offset,#2
+ LDR Offset,[SP,#0]
+ STRH stateIndex,[Offset,#2]
+
+ LDR readIndex,[pTapDelay],#4
+ ADD readIndex,readIndex,blockSize
+ SUBS readIndex,stateIndex,readIndex
+
+ @/*Wrap arround index*/
+ IT LT
+ ADDLT readIndex,readIndex,delaySize
+
+
+ @/*Processing begins*/
+ @/*First stage*/
+ MOV pY,pState
+ MOV pX,pScratch
+
+ @/* copy the sample from the circular buffer to the destination buffer */
+ SUB size1,delaySize,readIndex
+ CMP size1,blockSize
+ IT GT
+ MOVGT size1,blockSize
+
+ ADD pYtmp1,pY,readIndex, LSL #2
+ SUB size2,blockSize,size1
+ MOV pYtmp2,pY
+
+ CMP size1,#0
+ BLE firSparseEndcopy1
+firSparseCopy1:
+ LDR temp,[pYtmp1],#4
+ SUBS size1,#1
+ STR temp,[pScratch],#4
+ BGT firSparseCopy1
+firSparseEndcopy1:
+
+ CMP size2,#0
+ BLE firSparseEndcopy2
+firSparseCopy2:
+ LDR temp,[pYtmp2],#4
+ SUBS size2,#1
+ STR temp,[pScratch],#4
+ BGT firSparseCopy2
+firSparseEndcopy2:
+
+
+ @// Load blockSize from Stack
+ LDR blockSize,[SP,#44]
+
+
+ MOV pOut,pDst
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pCoeffs]!
+ @//CMP tapCnt,numTaps
+
+ @//Complete the case of tapCnt=numTaps
+ SUBS blkCnt,blockSize,#4
+ VLD1 {dInp_0,dInp_1},[pX]!
+ BLT firSparseEndInnerLoop
+firSparseInnerLoop:
+ VMUL qAcc0,qInp,qCoeff
+ VLD1 {dInp_0,dInp_1},[pX]!
+ SUBS blkCnt,#4
+ VST1 {dAcc0_0,dAcc0_1},[pOut]!
+ BGE firSparseInnerLoop
+
+firSparseEndInnerLoop:
+ ADDS blkCnt,#4
+ @/* If the blockSize is not a multiple of 4,
+ @* * compute the remaining samples */
+
+ VLD1 {dTemp_0,dTemp_1},[pOut]
+ VMUL qAcc0,qInp,qCoeff
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qAcc0,qTemp
+ VST1 {dMask_0,dMask_1},[pOut]
+ ADD pOut,pOut,blkCnt,LSL #2
+
+ LDR readIndex,[pTapDelay],#4
+ ADD readIndex,readIndex,blockSize
+ SUBS readIndex,stateIndex,readIndex
+
+ @/*Wrap arround index*/
+ IT LT
+ ADDLT readIndex,readIndex,delaySize
+
+ SUBS numTaps,#1
+ BLE firSparseEnd
+firSparseOuterLoop:
+
+ @// Load blockSize from Stack
+ LDR blockSize,[SP,#44]
+
+ MOV pY,pState
+ MOV pX,pScratch
+
+ @/* copy the sample from the circular buffer to the destination buffer */
+ SUB size1,delaySize,readIndex
+ CMP size1,blockSize
+ IT GT
+ MOVGT size1,blockSize
+
+ ADD pYtmp1,pY,readIndex, LSL #2
+ SUB size2,blockSize,size1
+ MOV pYtmp2,pY
+
+
+ CMP size1,#0
+ BLE firSparseEndcopy3
+firSparseCopy3:
+ LDR temp,[pYtmp1],#4
+ SUBS size1,#1
+ STR temp,[pScratch],#4
+ BGT firSparseCopy3
+firSparseEndcopy3:
+ CMP size2,#0
+ BLE firSparseEndcopy4
+firSparseCopy4:
+ LDR temp,[pYtmp2],#4
+ SUBS size2,#1
+ STR temp,[pScratch],#4
+ BGT firSparseCopy4
+firSparseEndcopy4:
+
+ @// Load blockSize from Stack
+ LDR blockSize,[SP,#44]
+
+
+ MOV pOut,pDst
+ VLD1 {dCoeff_0[],dCoeff_1[]},[pCoeffs]!
+
+
+ @//Complete the case of tapCnt=numTaps
+ SUBS blkCnt,blockSize,#4
+ VLD1 {dInp_0,dInp_1},[pX]!
+ VLD1 {dAcc0_0,dAcc0_1},[pOut]
+ BLT firSparseEndInnerLoop1
+firSparseInnerLoop1:
+ VMLA qAcc0,qInp,qCoeff
+ VLD1 {dInp_0,dInp_1},[pX]!
+ SUBS blkCnt,#4
+ VST1 {dAcc0_0,dAcc0_1},[pOut]!
+ VLD1 {dAcc0_0,dAcc0_1},[pOut]
+ BGE firSparseInnerLoop1
+
+firSparseEndInnerLoop1:
+ ADDS blkCnt,#4
+ @/* If the blockSize is not a multiple of 4,
+ @* * compute the remaining samples */
+
+
+ VMOV qMask,qMaskTmp
+ VBSL qMask,qInp,qZero
+ VMLA qAcc0,qMaskF32,qCoeff
+
+ VST1 {dAcc0_0,dAcc0_1},[pOut]
+ ADD pOut,pOut,blkCnt,LSL #2
+
+ LDR readIndex,[pTapDelay],#4
+ ADD readIndex,readIndex,blockSize
+ SUBS readIndex,stateIndex,readIndex
+
+ @/*Wrap arround index*/
+ IT LT
+ ADDLT readIndex,readIndex,delaySize
+
+ SUBS numTaps,#1
+
+ BGT firSparseOuterLoop
+firSparseEnd:
+ @// Return From Function
+ POP {r0}
+ POP {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq pScratch
+.unreq blockSize
+.unreq size2
+
+.unreq pYtmp1
+.unreq pOut
+.unreq Offset
+
+.unreq readIndex
+
+.unreq numTaps
+
+.unreq pState
+.unreq pCoeffs
+.unreq stateIndex
+
+.unreq maxDelay
+.unreq delaySize
+
+.unreq pTapDelay
+
+.unreq blkCnt
+.unreq size1
+.unreq temp
+.unreq mask
+.unreq pMask
+
+.unreq pX
+
+.unreq pY
+.unreq pYtmp2
+
+@/*NEON variale Declaration*/
+.unreq qInp
+.unreq dInp_0
+.unreq dInp_1
+
+.unreq qCoeff
+.unreq dCoeff_0
+.unreq dCoeff_1
+
+.unreq qZero
+
+.unreq qMask
+.unreq qMaskF32
+.unreq dMask_0
+.unreq dMask_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qMaskTmp
+.unreq dMaskTmp_0
+.unreq dMaskTmp_1
+
+ .end
--- /dev/null
+/*
+ * Copyright 2012 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string.h>
+
+#include "NE10_types.h"
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the floating-point FIR filter structure.
+ * @param[in] numTaps Number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of samples that are processed per call.
+ * @return none.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ * {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
+ */
+
+ne10_result_t ne10_fir_init_float (ne10_fir_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */
+ memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+ return NE10_OK;
+}
+
+/**
+ * @brief Initialization function for the floating-point FIR decimator.
+ * @param[in,out] *S points to an instance of the floating-point FIR decimator structure.
+ * @param[in] numTaps number of coefficients in the filter.
+ * @param[in] M decimation factor.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns NE10_OK if initialization was successful or NE10_ERR if
+ * <code>blockSize</code> is not a multiple of <code>M</code>.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ * {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>numTaps+blockSize-1</code> words where <code>blockSize</code> is the number of input samples passed to <code>arm_fir_decimate_f32()</code>.
+ * <code>M</code> is the decimation factor.
+ */
+
+ne10_result_t ne10_fir_decimate_init_float (
+ ne10_fir_decimate_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_uint8_t M,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize)
+{
+ ne10_result_t status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if ( (blockSize % M) != 0u)
+ {
+ /* Set status as NE10_ERR */
+ status = NE10_ERR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always (blockSize + numTaps - 1) */
+ memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation Factor */
+ S->M = M;
+
+ status = NE10_OK;
+ }
+
+ return (status);
+
+}
+
+/**
+ * @brief Initialization function for the floating-point FIR interpolator.
+ * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure.
+ * @param[in] L upsample factor.
+ * @param[in] numTaps number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficient buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns NE10_OK if initialization was successful or NE10_ERR if
+ * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ * {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * </pre>
+ * The length of the filter <code>numTaps</code> must be a multiple of the interpolation factor <code>L</code>.
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>(numTaps/L)+blockSize-1</code> words
+ * where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_interpolate_f32()</code>.
+ */
+
+ne10_result_t ne10_fir_interpolate_init_float (
+ ne10_fir_interpolate_instance_f32_t * S,
+ ne10_uint8_t L,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize)
+{
+ ne10_result_t status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if ( (numTaps % L) != 0u)
+ {
+ /* Set status as NE10_ERR */
+ status = NE10_ERR;
+ }
+ else
+ {
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */
+ memset (pState, 0,
+ (blockSize +
+ ( (ne10_uint32_t) S->phaseLength - 1u)) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = NE10_OK;
+ }
+
+ return (status);
+
+}
+
+/**
+ * @brief Initialization function for the floating-point FIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point FIR lattice structure.
+ * @param[in] numStages number of filter stages.
+ * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
+ * @param[in] *pState points to the state buffer. The array is of length numStages.
+ * @return none.
+ */
+
+ne10_result_t ne10_fir_lattice_init_float (
+ ne10_fir_lattice_instance_f32_t * S,
+ ne10_uint16_t numStages,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always numStages */
+ memset (pState, 0, (numStages) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ return NE10_OK;
+}
+
+/**
+ * @brief Initialization function for the floating-point sparse FIR filter.
+ * @param[in,out] *S points to an instance of the floating-point sparse FIR structure.
+ * @param[in] numTaps number of nonzero coefficients in the filter.
+ * @param[in] *pCoeffs points to the array of filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] *pTapDelay points to the array of offset times.
+ * @param[in] maxDelay maximum offset time supported.
+ * @param[in] blockSize number of samples that will be processed per block.
+ * @return none
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> holds the filter coefficients and has length <code>numTaps</code>.
+ * <code>pState</code> holds the filter's state variables and must be of length
+ * <code>maxDelay + blockSize</code>, where <code>maxDelay</code>
+ * is the maximum number of delay line values.
+ * <code>blockSize</code> is the
+ * number of samples processed by the <code>arm_fir_sparse_f32()</code> function.
+ */
+
+ne10_result_t ne10_fir_sparse_init_float (
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_uint16_t numTaps,
+ ne10_float32_t * pCoeffs,
+ ne10_float32_t * pState,
+ ne10_int32_t * pTapDelay,
+ ne10_uint16_t maxDelay,
+ ne10_uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0u;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ memset (pState, 0, (maxDelay + blockSize) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ return NE10_OK;
+}
+
+
--- /dev/null
+/*
+ * Copyright 2012 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_iir.c
+ */
+
+#include "NE10_types.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
+ *
+ * This set of functions implements lattice filters
+ * for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ * variety of adaptive filter applications. The filter structure has feedforward and
+ * feedback components and the net impulse response is infinite length.
+ * The functions operate on blocks
+ * of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter. <code>pSrc</code> and
+ * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
+
+ * \par Algorithm:
+ * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
+ * <pre>
+ * fN(n) = x(n)
+ * fm-1(n) = fm(n) - km * gm-1(n-1) for m = N, N-1, ...1
+ * gm(n) = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
+ * y(n) = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+ * </pre>
+ * \par
+ * <code>pkCoeffs</code> points to array of reflection coefficients of size <code>numStages</code>.
+ * Reflection coefficients are stored in time-reversed order.
+ * \par
+ * <pre>
+ * {kN, kN-1, ....k1}
+ * </pre>
+ * <code>pvCoeffs</code> points to the array of ladder coefficients of size <code>(numStages+1)</code>.
+ * Ladder coefficients are stored in time-reversed order.
+ * \par
+ * <pre>
+ * v0, v1, ...vN
+ * </pre>
+ * <code>pState</code> points to a state array of size <code>numStages + blockSize</code>.
+ * The state variables shown in the figure above (the g values) are stored in the <code>pState</code> array.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+ * <pre>
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ * </pre>
+ * \par
+ * where <code>numStages</code> is the number of stages in the filter; <code>pState</code> points to the state buffer array;
+ * <code>pkCoeffs</code> points to array of the reflection coefficients; <code>pvCoeffs</code> points to the array of ladder coefficients.
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize)
+{
+ ne10_float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */
+ ne10_float32_t acc; /* Accumlator */
+ ne10_uint32_t blkCnt, tapCnt; /* temporary variables for counts */
+ ne10_float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
+ ne10_uint32_t numStages = S->numStages; /* number of stages */
+ ne10_float32_t *pState; /* State pointer */
+ ne10_float32_t *pStateCurnt; /* State current pointer */
+
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ gcurr = 0.0f;
+ blkCnt = blockSize;
+
+ pState = &S->pState[0];
+
+ /* Sample processing */
+ while (blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0.0f;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[S->numStages];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+
+ /* Process sample for first tap */
+ gcurr = *px1++;
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = fcurr - ( (*pk) * gcurr);
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = (fnext * (*pk++)) + gcurr;
+ /* write gN(n) into state for next sample processing */
+ *px2++ = gnext;
+ /* y(n) += gN(n) * vN */
+ acc += (gnext * (*pv--));
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = (numStages - 1u) >> 2;
+
+ while (tapCnt > 0u)
+ {
+ /* Process sample for 2nd, 6th ...taps */
+ /* Read gN-2(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 2nd, 6th .. taps */
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext = fcurr - ( (*pk) * gcurr);
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = (fnext * (*pk++)) + gcurr;
+ /* y(n) += gN-1(n) * vN-1 */
+ /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+ acc += (gnext * (*pv--));
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = gnext;
+
+
+ /* Process sample for 3nd, 7th ...taps */
+ /* Read gN-3(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 3rd, 7th .. taps */
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fcurr = fnext - ( (*pk) * gcurr);
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = (fcurr * (*pk++)) + gcurr;
+ /* y(n) += gN-2(n) * vN-2 */
+ /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+ acc += (gnext * (*pv--));
+ /* write gN-2(n) into state for next sample processing */
+ *px2++ = gnext;
+
+
+ /* Process sample for 4th, 8th ...taps */
+ /* Read gN-4(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 4th, 8th .. taps */
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext = fcurr - ( (*pk) * gcurr);
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = (fnext * (*pk++)) + gcurr;
+ /* y(n) += gN-3(n) * vN-3 */
+ /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+ acc += (gnext * (*pv--));
+ /* write gN-3(n) into state for next sample processing */
+ *px2++ = gnext;
+
+
+ /* Process sample for 5th, 9th ...taps */
+ /* Read gN-5(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 5th, 9th .. taps */
+ /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */
+ fcurr = fnext - ( (*pk) * gcurr);
+ /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+ gnext = (fcurr * (*pk++)) + gcurr;
+ /* y(n) += gN-4(n) * vN-4 */
+ /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+ acc += (gnext * (*pv--));
+ /* write gN-4(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ tapCnt--;
+
+ }
+
+ fnext = fcurr;
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = (numStages - 1u) % 0x4u;
+
+ while (tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = fcurr - ( (*pk) * gcurr);
+ gnext = (fnext * (*pk++)) + gcurr;
+ /* Output samples for last taps */
+ acc += (gnext * (*pv--));
+ *px2++ = gnext;
+ fcurr = fnext;
+
+ tapCnt--;
+
+ }
+
+
+ /* y(n) += g0(n) * v0 */
+ acc += (fnext * (*pv));
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = acc;
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages >> 2u;
+
+ /* copy data */
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numStages) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while (tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+
+
+
+/**
+ * @} end of IIR_Lattice group
+ */
--- /dev/null
+@/*
+@ * Copyright 2012 ARM Limited
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License")@
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ */
+
+@/*
+@ * NE10 Library : dsp/NE10_iir.neon.s
+@ */
+
+
+ .text
+ .syntax unified
+
+ @/**
+ @ * @brief Processing function for the floating-point IIR lattice filter.
+ @ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ @ * @param[in] *pSrc points to the block of input data.
+ @ * @param[out] *pDst points to the block of output data.
+ @ * @param[in] blockSize number of samples to process.
+ @ * @return none.
+ @ */
+
+ .align 4
+ .global ne10_iir_lattice_float_neon
+ .extern ne10_qMaskTable32
+ .thumb
+ .thumb_func
+
+ne10_iir_lattice_float_neon:
+ PUSH {r4-r12,lr}
+ @VPUSH {d8,d9}
+
+@/*ARM Registers*/
+pStateStruct .req R0
+pSrc .req R1
+pDst .req R2
+blockSize .req R3
+
+pState .req R4 @/* State pointer */
+pKcoeffs .req R5 @/* Coefficient pointer */
+pVcoeffs .req R6 @/* Coefficient pointer */
+
+pX .req R7 @/* Temporary pointers for state buffer */
+pK .req R8 @/* Temporary pointers for coefficient buffer */
+numStages .req R9 @/* Length of the filter */
+
+tapCnt .req R10 @ /* Loop counter */
+pTemp .req R11
+
+
+pMask .req R14 @ /* Mask Table */
+
+mask .req R12
+pV .req R12
+
+@/*NEON variale Declaration*/
+dTemp3a_0 .dn D0.U32
+dTemp3_0 .dn D0.F32
+dMask2 .dn D1.U32
+
+qGcurr .qn Q1.F32
+dGcurr_0 .dn D2.F32
+dGcurr_1 .dn D3.F32
+
+qZero .qn Q2.F32
+
+qMask .qn Q3.U32
+dMask_0 .dn D6.U32
+dMask_1 .dn D7.U32
+dOut_0 .dn D6.F32
+dOut_1 .dn D7.F32
+
+qGK .qn Q4.F32
+dGK_0 .dn D8.F32
+dGK_1 .dn D9.F32
+
+qAcc0 .qn Q8.F32
+dAcc0_0 .dn D16.F32
+dAcc0_1 .dn D17.F32
+
+qTemp .qn Q9.F32
+dTemp_0 .dn D18.F32
+dTemp_1 .dn D19.F32
+
+qFnext .qn Q10.F32
+dFnext_0 .dn D20.F32
+dFnext_1 .dn D21.F32
+
+qFcurr .qn Q11.F32
+dFcurr_0 .dn D22.F32
+dFcurr_1 .dn D23.F32
+
+qCoeff0 .qn Q12.F32
+dCoeff0_0 .dn D24.F32
+dCoeff0_1 .dn D25.F32
+
+qMask1 .qn Q13.U32
+dMask1_0 .dn D26.U32
+dMask1_1 .dn D27.U32
+
+
+qMaskTmp .qn Q14.U32
+dMaskTmp_0 .dn D28.U32
+dMaskTmp_1 .dn D29.U32
+
+qGnext .qn Q15.F32
+dGnext_0 .dn D30.F32
+dGnext_1 .dn D31.F32
+
+
+ @/* Length of the filter */
+ LDRH numStages,[pStateStruct],#4
+ @/* State pointer */
+ LDR pState,[pStateStruct],#4
+ @/* Coefficient pointer */
+ LDR pKcoeffs,[pStateStruct],#4
+ LDR pVcoeffs,[pStateStruct],#4
+
+
+ @/*Load Mask Valies*/
+ LDR pMask,=ne10_qMaskTable32
+ AND mask,numStages,#3
+ ADD tapCnt,mask,#1
+
+ ADD pTemp,pMask,mask,LSL #4
+ ADD tapCnt,pMask,tapCnt,LSL #4
+
+ VLD1 {dMaskTmp_0,dMaskTmp_1},[pTemp]
+ VLD1 {dMask1_0,dMask1_1},[tapCnt]
+
+ ADD pTemp,pMask,#16
+ VEOR qZero,qZero
+ VLD1 {dMask2},[pTemp]
+
+ @/*while blockSize > 0*/
+ CMP blockSize, #0
+ BEQ firLatticeCopy
+
+
+firLatticeOuterLoop:
+ VLD1 {dFcurr_0[],dFcurr_1[]},[pSrc]!
+ MOV pX,pState
+ VEOR qAcc0,qAcc0
+ @/* Initialize Ladder coeff pointer */
+ ADD pV,pVcoeffs,numStages, LSL #2
+ MOV pK,pKcoeffs
+
+ VLD1 {dGcurr_0,dGcurr_1},[pX]
+ @/* Load the filter Taps */
+ VLD1 {dCoeff0_0,dCoeff0_1},[pK]!
+
+ SUBS tapCnt,numStages,#4
+ ADD pV,pV,#4
+ BLT firLatticeEndInnerLoop
+
+
+firLatticeInnerLoop:
+
+
+
+ VMUL qGK,qGcurr,qCoeff0
+
+ @/* g4k4+g5k5 g6k6+g7k7*/
+ VPADD dTemp_0,dGK_1,dGK_0
+ @/*g6k6 g4k4+g5k5*/
+ VEXT dTemp_1,dTemp_0,dGK_1,#1
+ @/*g7k7+g6k6+g5k5+g4k4 g6k6+g5k5+g4k4*/
+ VPADD dTemp_1,dTemp_1,dTemp_0
+ VMOV dTemp3a_0,dMask2
+ VBSL dTemp3a_0,dGK_0,dTemp_0
+ VMOV dTemp_0,dTemp3_0
+ VSUB qFnext,qFcurr,qTemp
+
+ @/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ VMLA qGcurr,qFnext,qCoeff0
+
+ @/* y(n) += gN(n) * vN */
+ SUB pV,pV,#16
+ VLD1 {dCoeff0_0,dCoeff0_1},[pV]
+
+ @/* write gN-1(n-1) into state for next sample processing */
+ VST1 {dGcurr_0,dGcurr_1},[pX]!
+ VREV64 qCoeff0,qCoeff0
+ @/* acc0 += gnext * (*pv--)@ */
+ VMLA dAcc0_0,dGcurr_0,dCoeff0_1
+ VMLA dAcc0_1,dGcurr_1,dCoeff0_0
+
+ @/* Update f values for next coefficients processing */
+
+ VDUP qFcurr,dFnext_1[1]
+
+ VLD1 {dGcurr_0,dGcurr_1},[pX]
+ @/* Load the filter Taps */
+ VLD1 {dCoeff0_0,dCoeff0_1},[pK]!
+
+ SUBS tapCnt,#4
+ BGE firLatticeInnerLoop
+firLatticeEndInnerLoop:
+ @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ ADDS tapCnt,#4
+ IT GT
+ SUBGT tapCnt,#1
+
+
+ VMUL qGK,qGcurr,qCoeff0
+
+ VPADD dTemp_0,dGK_1,dGK_0
+ VEXT dTemp_1,dTemp_0,dGK_1,#1
+ VPADD dTemp_1,dTemp_1,dTemp_0
+ VMOV dTemp3a_0,dMask2
+ VBSL dTemp3a_0,dGK_0,dTemp_0
+ VMOV dTemp_0,dTemp3_0
+
+ @/*Mask the Uncessary f values*/
+ VMOV qFnext,qMaskTmp
+ VBSL qFnext,qTemp,qZero
+ VSUB qFnext,qFcurr,qFnext
+
+ VMOV qGnext,qGcurr
+ VMLA qGnext,qFnext,qCoeff0
+
+ @/*Store on to stack for getting proper Fnext*/
+ SUB pTemp,SP,#20
+ VST1 {dFnext_0,dFnext_1},[pTemp]
+
+ ADD pTemp,pTemp,tapCnt, LSL #2
+ VLD1 {dTemp_0[],dTemp_1[]},[pTemp]
+
+ VMOV qGcurr,qMaskTmp
+ VBSL qGcurr,qGnext,qTemp
+
+ VLD1 {dTemp_0,dTemp_1},[pX]
+ VMOV qMask,qMask1
+ VBSL qMask,qGcurr,qTemp
+ VST1 {dMask_0,dMask_1},[pX]
+
+ ADD pX,pX,tapCnt,LSL #2
+
+ SUB pV,pV,#16
+ VLD1 {dCoeff0_0,dCoeff0_1},[pV]
+
+ @// MASk the Gnext value used for Output calculation
+ VMOV qGnext,qMask1
+ VBSL qGnext,qGcurr,qZero
+ ADD pX,pX,#4
+
+ VREV64 qCoeff0,qCoeff0
+
+ VMLA dAcc0_0,dGnext_0,dCoeff0_1
+ VMLA dAcc0_1,dGnext_1,dCoeff0_0
+
+ /*Get Accumulated Result in to single Value*/
+
+ VLD1 {dTemp_1},[pDst]
+ VPADD dTemp_0,dAcc0_0,dAcc0_1
+ VPADD dTemp_0,dTemp_0
+
+ VMOV dMask_0,dMask2
+ VBSL dMask_0,dTemp_0,dTemp_1
+
+ VST1 {dMask_0},[pDst]
+ ADD pDst,#4
+ ADD pState,#4
+
+ SUBS blockSize,#1
+
+ BGT firLatticeOuterLoop
+
+
+ @/* copy last S->numStages samples to start of the buffer
+ @for next frame process */
+
+firLatticeCopy:
+ AND mask,numStages,#3
+ ADD pTemp,pMask,mask,LSL #4
+ LDR pX,[pStateStruct,#-12]
+
+ VLD1 {dFcurr_0,dFcurr_1},[pState]!
+ VLD1 {dMask_0,dMask_1},[pTemp]
+ SUBS tapCnt,numStages,#4
+ BLT firLatticeEnd
+firLatticeCopyLoop:
+ VST1 {dFcurr_0,dFcurr_1},[pX]!
+ SUBS tapCnt,#4
+ VLD1 {dFcurr_0,dFcurr_1},[pState]!
+ BGE firLatticeCopyLoop
+firLatticeEnd:
+ VLD1 {dTemp_0,dTemp_1},[pX]
+ VBSL qMask,qFcurr,qTemp
+ VST1 {dOut_0,dOut_1},[pX]
+ ADD pX,pX,mask, LSL #2
+@/*ARM Registers*/
+.unreq pStateStruct
+.unreq pSrc
+.unreq pDst
+.unreq blockSize
+
+.unreq pState
+.unreq pKcoeffs
+.unreq pVcoeffs
+
+.unreq pX
+.unreq pK
+.unreq numStages
+
+.unreq tapCnt
+.unreq pTemp
+.unreq pMask
+.unreq mask
+.unreq pV
+
+@/*NEON variale Declaration*/
+.unreq dTemp3a_0
+.unreq dTemp3_0
+.unreq dMask2
+
+.unreq qGcurr
+.unreq dGcurr_0
+.unreq dGcurr_1
+
+.unreq qZero
+.unreq qMask
+.unreq dMask_0
+.unreq dMask_1
+.unreq dOut_0
+.unreq dOut_1
+
+.unreq qGK
+.unreq dGK_0
+.unreq dGK_1
+
+.unreq qAcc0
+.unreq dAcc0_0
+.unreq dAcc0_1
+
+.unreq qTemp
+.unreq dTemp_0
+.unreq dTemp_1
+
+.unreq qFnext
+.unreq dFnext_0
+.unreq dFnext_1
+
+.unreq qFcurr
+.unreq dFcurr_0
+.unreq dFcurr_1
+
+.unreq qCoeff0
+.unreq dCoeff0_0
+.unreq dCoeff0_1
+
+.unreq qMask1
+.unreq dMask1_0
+.unreq dMask1_1
+
+.unreq qMaskTmp
+.unreq dMaskTmp_0
+.unreq dMaskTmp_1
+
+.unreq qGnext
+.unreq dGnext_0
+.unreq dGnext_1
+
+ @VPOP {d8,d9}
+ POP {r4-r12,pc}
+
+ .end
--- /dev/null
+/*
+ * Copyright 2012 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_iir_init.c
+ */
+#include "NE10_types.h"
+
+
+/**
+ * @brief Initialization function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
+ * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+ne10_result_t ne10_iir_lattice_init_float (ne10_iir_lattice_instance_f32_t * S,
+ ne10_uint16_t numStages,
+ ne10_float32_t * pkCoeffs,
+ ne10_float32_t * pvCoeffs,
+ ne10_float32_t * pState,
+ ne10_uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset (pState, 0, (numStages + blockSize) * sizeof (ne10_float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ return NE10_OK;
+}
+
+/**
+ * @} end of IIR_Lattice group
+ */
ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_neon;
ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_neon;
ne10_rfft_float = ne10_rfft_float_neon;
+
+ ne10_fir_float = ne10_fir_float_neon;
+ ne10_fir_decimate_float = ne10_fir_decimate_float_neon;
+ ne10_fir_interpolate_float = ne10_fir_interpolate_float_neon;
+ ne10_fir_lattice_float = ne10_fir_lattice_float_neon;
+ ne10_fir_sparse_float = ne10_fir_sparse_float_neon;
+
+ ne10_iir_lattice_float = ne10_iir_lattice_float_neon;
}
else
{
ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_c;
ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_c;
ne10_rfft_float = ne10_rfft_float_c;
+
+ ne10_fir_float = ne10_fir_float_c;
+ ne10_fir_decimate_float = ne10_fir_decimate_float_c;
+ ne10_fir_interpolate_float = ne10_fir_interpolate_float_c;
+ ne10_fir_lattice_float = ne10_fir_lattice_float_c;
+ ne10_fir_sparse_float = ne10_fir_sparse_float_c;
+
+ ne10_iir_lattice_float = ne10_iir_lattice_float_c;
}
return NE10_OK;
}
ne10_float32_t * pSrc,
ne10_float32_t * pDst,
ne10_float32_t * pTemp);
+
+void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+void (*ne10_fir_decimate_float)(
+ const ne10_fir_decimate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+void (*ne10_fir_interpolate_float)(
+ const ne10_fir_interpolate_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+void (*ne10_fir_lattice_float)(
+ const ne10_fir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);
+
+void (*ne10_fir_sparse_float)(
+ ne10_fir_sparse_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_float32_t * pScratchIn,
+ ne10_uint32_t blockSize);
+
+void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S,
+ ne10_float32_t * pSrc,
+ ne10_float32_t * pDst,
+ ne10_uint32_t blockSize);