From: yang <yang.zhang@arm.com>
Date: Fri, 19 Oct 2012 03:02:42 +0000 (+0800)
Subject: add fir and iir filter function:
X-Git-Tag: v1.0.0~20^2~5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a4d7cf270e4dfeca379d1801e5a7778e7e8c9411;p=platform%2Fupstream%2Fne10.git

add fir and iir filter function:
1. fir
2. fir decimate
3. fir interpolate
4. fir lattice
5. fir sparse
6. iir lattice
---

diff --git a/common/NE10_mask_table.c b/common/NE10_mask_table.c
index c0a414b..a7845aa 100644
--- a/common/NE10_mask_table.c
+++ b/common/NE10_mask_table.c
@@ -36,4 +36,23 @@ const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE] =
 
 };
 
+const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE]=
+{
 
+    65535,32768,21845,16384,13107,10923,9362,8192,7282,6554,5958,5461,5041,4681,4369,4096,
+    3855,3641,3449,3277,3121,2979,2849,2731,2621,2521,2427,2341,2260,2185,2114,2048,
+    1986,1928,1872,1820,1771,1725,1680,1638,1598,1560,1524,1489,1456,1425,1394,1365,
+    1337,1311,1285,1260,1237,1214,1192,1170,1150,1130,1111,1092,1074,1057,1040,1024,
+    1008,993,978,964,950,936,923,910,898,886,874,862,851,840,830,819,
+    809,799,790,780,771,762,753,745,736,728,720,712,705,697,690,683,
+    676,669,662,655,649,643,636,630,624,618,612,607,601,596,590,585,
+    580,575,570,565,560,555,551,546,542,537,533,529,524,520,516,512,
+    508,504,500,496,493,489,485,482,478,475,471,468,465,462,458,455,
+    452,449,446,443,440,437,434,431,428,426,423,420,417,415,412,410,
+    407,405,402,400,397,395,392,390,388,386,383,381,379,377,374,372,
+    370,368,366,364,362,360,358,356,354,352,350,349,347,345,343,341,
+    340,338,336,334,333,331,329,328,326,324,323,321,320,318,317,315,
+    314,312,311,309,308,306,305,303,302,301,299,298,297,295,294,293,
+    291,290,289,287,286,285,284,282,281,280,279,278,277,275,274,273,
+    272,271,270,269,267,266,265,264,263,262,261,260,259,258,257
+    };
diff --git a/common/NE10_mask_table.h b/common/NE10_mask_table.h
index 9787ffc..67211c4 100644
--- a/common/NE10_mask_table.h
+++ b/common/NE10_mask_table.h
@@ -22,10 +22,12 @@
 #ifndef _ARM_MASK_TABLE_H
 #define _ARM_MASK_TABLE_H
 
-#define Q_MASK_TABLE_SIZE 20
-#define D_MASK_TABLE_SIZE 6
+#define Q_MASK_TABLE_SIZE        20
+#define D_MASK_TABLE_SIZE        6
+#define DIV_LOOKUP_TABLE_SIZE    255
 
 extern const ne10_uint32_t ne10_qMaskTable32[Q_MASK_TABLE_SIZE];
 extern const ne10_uint32_t ne10_dMaskTable32[D_MASK_TABLE_SIZE];
+extern const ne10_uint32_t ne10_divLookUpTable[DIV_LOOKUP_TABLE_SIZE];
 #endif
 
diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h
index 1cc492e..5240126 100644
--- a/inc/NE10_dsp.h
+++ b/inc/NE10_dsp.h
@@ -93,6 +93,166 @@ extern void ne10_rfft_float_neon(
                      ne10_float32_t * pSrc,
                      ne10_float32_t * pDst,
                      ne10_float32_t * pTemp);
+/* fir functions*/
+
+/* function pointers*/
+extern void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_decimate_float)(
+                     const ne10_fir_decimate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_interpolate_float)(
+                     const ne10_fir_interpolate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_lattice_float)(
+                     const ne10_fir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void (*ne10_fir_sparse_float)(
+                     ne10_fir_sparse_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_float32_t * pScratchIn,
+                     ne10_uint32_t blockSize);
+
+
+/* init functions*/
+extern ne10_result_t ne10_fir_init_float(ne10_fir_instance_f32_t * S,
+                     ne10_uint16_t numTaps,
+                     ne10_float32_t * pCoeffs,
+                     ne10_float32_t * pState,
+                     ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_decimate_init_float(
+                     ne10_fir_decimate_instance_f32_t * S,
+                     ne10_uint16_t numTaps,
+                     ne10_uint8_t M,
+                     ne10_float32_t * pCoeffs,
+                     ne10_float32_t * pState,
+                     ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_interpolate_init_float(
+                     ne10_fir_interpolate_instance_f32_t * S,
+                     ne10_uint8_t L,
+                     ne10_uint16_t numTaps,
+                     ne10_float32_t * pCoeffs,
+                     ne10_float32_t * pState,
+                     ne10_uint32_t blockSize);
+
+extern ne10_result_t ne10_fir_lattice_init_float(
+                     ne10_fir_lattice_instance_f32_t * S,
+                     ne10_uint16_t numStages,
+                     ne10_float32_t * pCoeffs,
+                     ne10_float32_t * pState);
+
+extern ne10_result_t ne10_fir_sparse_init_float(
+                     ne10_fir_sparse_instance_f32_t * S,
+                     ne10_uint16_t numTaps,
+                     ne10_float32_t * pCoeffs,
+                     ne10_float32_t * pState,
+                     ne10_int32_t * pTapDelay,
+                     ne10_uint16_t maxDelay,
+                     ne10_uint32_t blockSize);
+
+/* C version*/
+extern void ne10_fir_float_c(const ne10_fir_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_decimate_float_c(
+                     const ne10_fir_decimate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_interpolate_float_c(
+                     const ne10_fir_interpolate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_lattice_float_c(
+                     const ne10_fir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_sparse_float_c(
+                     ne10_fir_sparse_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_float32_t * pScratchIn,
+                     ne10_uint32_t blockSize);
+
+
+/* NEON version*/
+extern void ne10_fir_float_neon(const ne10_fir_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_decimate_float_neon(const ne10_fir_decimate_instance_f32_t * S,
+                     ne10_float32_t *pSrc,
+                     ne10_float32_t *pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_interpolate_float_neon(const ne10_fir_interpolate_instance_f32_t * S,
+                     ne10_float32_t *pSrc,
+                     ne10_float32_t *pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_lattice_float_neon(const ne10_fir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+extern void ne10_fir_sparse_float_neon(ne10_fir_sparse_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_float32_t * pScratch,
+                     ne10_uint32_t blockSize);
+
+
+/* iir functions*/
+
+/* function pointers*/
+extern void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+/* init functions*/
+extern ne10_result_t ne10_iir_lattice_init_float(ne10_iir_lattice_instance_f32_t * S,
+                     ne10_uint16_t numStages,
+                     ne10_float32_t * pkCoeffs,
+                     ne10_float32_t * pvCoeffs,
+                     ne10_float32_t * pState,
+                     ne10_uint32_t blockSize);
+
+
+/* C version*/
+extern void ne10_iir_lattice_float_c(const ne10_iir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+/* NEON version*/
+extern void ne10_iir_lattice_float_neon(const ne10_iir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
 
 #ifdef __cplusplus
 }
diff --git a/inc/NE10_types.h b/inc/NE10_types.h
index e4db06e..bd03629 100644
--- a/inc/NE10_types.h
+++ b/inc/NE10_types.h
@@ -199,4 +199,74 @@ typedef struct
     ne10_cfft_radix4_instance_f32_t *p_cfft;         /**< Pointer to the complex FFT Instance. */
 } ne10_rfft_instance_f32_t;
 
+/////////////////////////////////////////////////////////
+// definitions for fir
+/////////////////////////////////////////////////////////
+
+/*
+ * @brief Instance structure for the floating-point FIR filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numTaps;    /**< Length of the filter. */
+    ne10_float32_t *pState;    /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+    ne10_float32_t *pCoeffs;   /**< Points to the coefficient array. The array is of length numTaps. */
+} ne10_fir_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating point FIR Lattice filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numStages;    /**< numStages of the of lattice filter. */
+    ne10_float32_t *pState;      /**< Points to the state variable array. The array is of length numStages. */
+    ne10_float32_t *pCoeffs;     /**< Points to the coefficient array. The array is of length numStages. */
+} ne10_fir_lattice_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Decimation.
+ */
+typedef struct
+{
+    ne10_uint8_t  M;            /**< Decimation Factor. */
+    ne10_uint16_t numTaps;      /**< Length of the filter. */
+    ne10_float32_t    *pCoeffs;      /**< Points to the coefficient array. The array is of length numTaps.*/
+    ne10_float32_t    *pState;       /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+} ne10_fir_decimate_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Interpolation.
+ */
+typedef struct
+{
+    ne10_uint8_t L;             /**< Interpolation Factor. */
+    ne10_uint16_t phaseLength;  /**< Length of each polyphase filter component. */
+    ne10_float32_t *pCoeffs;         /**< Points to the coefficient array. The array is of length numTaps.*/
+    ne10_float32_t *pState;          /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+} ne10_fir_interpolate_instance_f32_t;
+
+/*
+ * @brief Instance structure for the floating-point FIR Sparse filter.
+ */
+typedef struct
+{
+    ne10_uint16_t numTaps;      /**< Length of the filter. */
+    ne10_uint16_t stateIndex;   /**< Index pointer for the state buffer .*/
+    ne10_float32_t *pState;          /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
+    ne10_float32_t *pCoeffs;         /**< Points to the coefficient array. The array is of length numTaps.*/
+    ne10_uint16_t  maxDelay;    /**< the largest number of delay line values .*/
+    ne10_int32_t  *pTapDelay;    /**< Pointer to the array containing positions of the non-zero tap values. */
+} ne10_fir_sparse_instance_f32_t;
+
+/**
+   * @brief Instance structure for the floating point IIR Lattice filter.
+   */
+typedef struct
+{
+    ne10_uint16_t numStages;    /**< numStages of the of lattice filter. */
+    ne10_float32_t *pState;      /**< Points to the state variable array. The array is of length numStages + blockSize -1. */
+    ne10_float32_t *pkCoeffs;    /**< Points to the reflection coefficient array. The array is of length numStages. */
+    ne10_float32_t *pvCoeffs;    /**< Points to the ladder coefficient array. The array is of length numStages+1. */
+} ne10_iir_lattice_instance_f32_t;
+
 #endif
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 7329b63..f74ed9d 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -122,6 +122,10 @@ if(NE10_ENABLE_DSP)
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft_init.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_init.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir_init.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir_init.c
     )
 
     # Add dsp intrinsic NEON files.
@@ -137,6 +141,8 @@ if(NE10_ENABLE_DSP)
     # Add dsp NEON files.
     set(NE10_DSP_NEON_SRCS
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.neon.s
     )
 
     # Tell CMake these files need to go to the C compiler
diff --git a/modules/dsp/NE10_fir.c b/modules/dsp/NE10_fir.c
new file mode 100644
index 0000000..4c95d28
--- /dev/null
+++ b/modules/dsp/NE10_fir.c
@@ -0,0 +1,1280 @@
+/*
+ *  Copyright 2012 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_fir.c
+ */
+
+#include "NE10_types.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup FIR Finite Impulse Response (FIR) Filters
+ *
+ * This set of functions implements Finite Impulse Response (FIR) filters
+ * for floating-point data types.
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
+ * <code>pDst</code> points to input and output arrays containing <code>blockSize</code> values.
+ *
+ * \par Algorithm:
+ * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
+ * Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
+ * <pre>
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * </pre>
+ * \par
+ * \image html FIR.gif "Finite Impulse Response filter"
+ * \par
+ * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
+ * Coefficients are stored in time reversed order.
+ * \par
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
+ * Samples in the state buffer are stored in the following order.
+ * \par
+ * <pre>
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * </pre>
+ * \par
+ * Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
+ * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
+ * to be avoided and yields a significant speed improvement.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 4 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 4 different data type filter instance structures
+ * <pre>
+ *ne10_fir_instance_f32_t S = {numTaps, pState, pCoeffs};
+ * </pre>
+ *
+ * where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
+ * <code>pCoeffs</code> is the address of the coefficient buffer.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ *
+ * @param[in]  *S points to an instance of the floating-point FIR filter structure.
+ * @param[in]  *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in]  blockSize number of samples to process per call.
+ * @return     none.
+ *
+ */
+
+void ne10_fir_float_c (
+    const ne10_fir_instance_f32_t * S,
+    ne10_float32_t * pSrc,
+    ne10_float32_t * pDst,
+    ne10_uint32_t blockSize)
+{
+
+    ne10_float32_t *pState = S->pState;                 /* State pointer */
+    ne10_float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+    ne10_float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+    ne10_float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
+    ne10_uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+    ne10_uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    ne10_float32_t acc0, acc1, acc2, acc3;              /* Accumulators */
+    ne10_float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
+
+
+    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+    /* pStateCurnt points to the location where the new input data should be written */
+    pStateCurnt = & (S->pState[ (numTaps - 1u)]);
+
+    /* Apply loop unrolling and compute 4 output values simultaneously.
+     * The variables acc0 ... acc3 hold output values that are being computed:
+     *
+     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+     *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+     *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
+     */
+    blkCnt = blockSize >> 2;
+
+    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+     ** a second loop below computes the remaining 1 to 3 samples. */
+    while (blkCnt > 0u)
+    {
+        /* Copy four new input samples into the state buffer */
+        *pStateCurnt++ = *pSrc++;
+        *pStateCurnt++ = *pSrc++;
+        *pStateCurnt++ = *pSrc++;
+        *pStateCurnt++ = *pSrc++;
+
+        /* Set all accumulators to zero */
+        acc0 = 0.0f;
+        acc1 = 0.0f;
+        acc2 = 0.0f;
+        acc3 = 0.0f;
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Initialize coeff pointer */
+        pb = (pCoeffs);
+
+        /* Read the first three samples from the state buffer:  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+        x0 = *px++;
+        x1 = *px++;
+        x2 = *px++;
+
+        /* Loop unrolling.  Process 4 taps at a time. */
+        tapCnt = numTaps >> 2u;
+
+        /* Loop over the number of taps.  Unroll by a factor of 4.
+         ** Repeat until we've computed numTaps-4 coefficients. */
+        while (tapCnt > 0u)
+        {
+            /* Read the b[numTaps-1] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-3] sample */
+            x3 = * (px++);
+
+            /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
+            acc0 += x0 * c0;
+
+            /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
+            acc1 += x1 * c0;
+
+            /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
+            acc2 += x2 * c0;
+
+            /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
+            acc3 += x3 * c0;
+
+            /* Read the b[numTaps-2] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-4] sample */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            acc0 += x1 * c0;
+            acc1 += x2 * c0;
+            acc2 += x3 * c0;
+            acc3 += x0 * c0;
+
+            /* Read the b[numTaps-3] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-5] sample */
+            x1 = * (px++);
+
+            /* Perform the multiply-accumulates */
+            acc0 += x2 * c0;
+            acc1 += x3 * c0;
+            acc2 += x0 * c0;
+            acc3 += x1 * c0;
+
+            /* Read the b[numTaps-4] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-6] sample */
+            x2 = * (px++);
+
+            /* Perform the multiply-accumulates */
+            acc0 += x3 * c0;
+            acc1 += x0 * c0;
+            acc2 += x1 * c0;
+            acc3 += x2 * c0;
+
+            tapCnt--;
+        }
+
+        /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+        tapCnt = numTaps % 0x4u;
+
+        while (tapCnt > 0u)
+        {
+            /* Read coefficients */
+            c0 = * (pb++);
+
+            /* Fetch 1 state variable */
+            x3 = * (px++);
+
+            /* Perform the multiply-accumulates */
+            acc0 += x0 * c0;
+            acc1 += x1 * c0;
+            acc2 += x2 * c0;
+            acc3 += x3 * c0;
+
+            /* Reuse the present sample states for next sample */
+            x0 = x1;
+            x1 = x2;
+            x2 = x3;
+
+            /* Decrement the loop counter */
+            tapCnt--;
+        }
+
+        /* Advance the state pointer by 4 to process the next group of 4 samples */
+        pState = pState + 4;
+
+        /* The results in the 4 accumulators, store in the destination buffer. */
+        *pDst++ = acc0;
+        *pDst++ = acc1;
+        *pDst++ = acc2;
+        *pDst++ = acc3;
+
+        blkCnt--;
+    }
+
+    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+     ** No loop unrolling is used. */
+    blkCnt = blockSize % 0x4u;
+
+    while (blkCnt > 0u)
+    {
+        /* Copy one sample at a time into state buffer */
+        *pStateCurnt++ = *pSrc++;
+
+        /* Set the accumulator to zero */
+        acc0 = 0.0f;
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Initialize Coefficient pointer */
+        pb = (pCoeffs);
+
+        i = numTaps;
+
+        /* Perform the multiply-accumulates */
+        do
+        {
+            acc0 += *px++ * *pb++;
+            i--;
+
+        }
+        while (i > 0u);
+
+        /* The result is store in the destination buffer. */
+        *pDst++ = acc0;
+
+        /* Advance state pointer by 1 for the next sample */
+        pState = pState + 1;
+
+        blkCnt--;
+    }
+
+    /* Processing is complete.
+     ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+     ** This prepares the state buffer for the next function call. */
+
+    /* Points to the start of the state buffer */
+    pStateCurnt = S->pState;
+
+    tapCnt = (numTaps - 1u) >> 2u;
+
+    /* copy data */
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+    /* Calculate remaining number of copies */
+    tapCnt = (numTaps - 1u) % 0x4u;
+
+    /* Copy the remaining q31_t data */
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+}
+
+/**
+   * @brief Processing function for the floating-point FIR decimator.
+   * @param[in] *S        points to an instance of the floating-point FIR decimator structure.
+   * @param[in] *pSrc     points to the block of input data.
+   * @param[out] *pDst    points to the block of output data.
+   * @param[in] blockSize number of input samples to process per call.
+   * @return none.
+   */
+
+void ne10_fir_decimate_float_c (
+    const ne10_fir_decimate_instance_f32_t * S,
+    ne10_float32_t * pSrc,
+    ne10_float32_t * pDst,
+    ne10_uint32_t blockSize)
+{
+    ne10_float32_t *pState = S->pState;                 /* State pointer */
+    ne10_float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+    ne10_float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+    ne10_float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
+    ne10_float32_t sum0;                                /* Accumulator */
+    ne10_float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
+    ne10_uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+    ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
+
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    /* S->pState buffer contains previous frame (numTaps - 1) samples */
+    /* pStateCurnt points to the location where the new input data should be written */
+    pStateCurnt = S->pState + (numTaps - 1u);
+
+    /* Total number of output samples to be computed */
+    blkCnt = outBlockSize;
+
+    while (blkCnt > 0u)
+    {
+        /* Copy decimation factor number of new input samples into the state buffer */
+        i = S->M;
+
+        do
+        {
+            *pStateCurnt++ = *pSrc++;
+
+        }
+        while (--i);
+
+        /* Set accumulator to zero */
+        sum0 = 0.0f;
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Initialize coeff pointer */
+        pb = pCoeffs;
+
+        /* Loop unrolling.  Process 4 taps at a time. */
+        tapCnt = numTaps >> 2;
+
+        /* Loop over the number of taps.  Unroll by a factor of 4.
+         ** Repeat until we've computed numTaps-4 coefficients. */
+        while (tapCnt > 0u)
+        {
+            /* Read the b[numTaps-1] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-1] sample */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            sum0 += x0 * c0;
+
+            /* Read the b[numTaps-2] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-2] sample */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            sum0 += x0 * c0;
+
+            /* Read the b[numTaps-3] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-3] sample */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            sum0 += x0 * c0;
+
+            /* Read the b[numTaps-4] coefficient */
+            c0 = * (pb++);
+
+            /* Read x[n-numTaps-4] sample */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            sum0 += x0 * c0;
+
+            /* Decrement the loop counter */
+            tapCnt--;
+        }
+
+        /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+        tapCnt = numTaps % 0x4u;
+
+        while (tapCnt > 0u)
+        {
+            /* Read coefficients */
+            c0 = * (pb++);
+
+            /* Fetch 1 state variable */
+            x0 = * (px++);
+
+            /* Perform the multiply-accumulate */
+            sum0 += x0 * c0;
+
+            /* Decrement the loop counter */
+            tapCnt--;
+        }
+
+        /* Advance the state pointer by the decimation factor
+         * to process the next group of decimation factor number samples */
+        pState = pState + S->M;
+
+        /* The result is in the accumulator, store in the destination buffer. */
+        *pDst++ = sum0;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Processing is complete.
+     ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+     ** This prepares the state buffer for the next function call. */
+
+    /* Points to the start of the state buffer */
+    pStateCurnt = S->pState;
+
+    i = (numTaps - 1u) >> 2;
+
+    /* copy data */
+    while (i > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        i--;
+    }
+
+    i = (numTaps - 1u) % 0x04u;
+
+    /* copy data */
+    while (i > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        i--;
+    }
+
+}
+
+/**
+ * @brief Processing function for the floating-point FIR interpolator.
+ * @param[in] *S        points to an instance of the floating-point FIR interpolator structure.
+ * @param[in] *pSrc     points to the block of input data.
+ * @param[out] *pDst    points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void ne10_fir_interpolate_float_c (
+    const ne10_fir_interpolate_instance_f32_t * S,
+    ne10_float32_t * pSrc,
+    ne10_float32_t * pDst,
+    ne10_uint32_t blockSize)
+{
+    ne10_float32_t *pState = S->pState;                 /* State pointer */
+    ne10_float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+    ne10_float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+    ne10_float32_t *ptr1, *ptr2;                        /* Temporary pointers for state and coefficient buffers */
+
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    ne10_float32_t sum0;                                /* Accumulators */
+    ne10_float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
+    ne10_uint32_t i, blkCnt, j;                         /* Loop counters */
+    ne10_uint16_t phaseLen = S->phaseLength, tapCnt;    /* Length of each polyphase filter component */
+
+
+    /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+    /* pStateCurnt points to the location where the new input data should be written */
+    pStateCurnt = S->pState + (phaseLen - 1u);
+
+    /* Total number of intput samples */
+    blkCnt = blockSize;
+
+    /* Loop over the blockSize. */
+    while (blkCnt > 0u)
+    {
+        /* Copy new input sample into the state buffer */
+        *pStateCurnt++ = *pSrc++;
+
+        /* Address modifier index of coefficient buffer */
+        j = 1u;
+
+        /* Loop over the Interpolation factor. */
+        i = S->L;
+        while (i > 0u)
+        {
+            /* Set accumulator to zero */
+            sum0 = 0.0f;
+
+            /* Initialize state pointer */
+            ptr1 = pState;
+
+            /* Initialize coefficient pointer */
+            ptr2 = pCoeffs + (S->L - j);
+
+            /* Loop over the polyPhase length. Unroll by a factor of 4.
+             ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+            tapCnt = phaseLen >> 2u;
+            while (tapCnt > 0u)
+            {
+
+                /* Read the coefficient */
+                c0 = * (ptr2);
+
+                /* Upsampling is done by stuffing L-1 zeros between each sample.
+                 * So instead of multiplying zeros with coefficients,
+                 * Increment the coefficient pointer by interpolation factor times. */
+                ptr2 += S->L;
+
+                /* Read the input sample */
+                x0 = * (ptr1++);
+
+                /* Perform the multiply-accumulate */
+                sum0 += x0 * c0;
+
+                /* Read the coefficient */
+                c0 = * (ptr2);
+
+                /* Increment the coefficient pointer by interpolation factor times. */
+                ptr2 += S->L;
+
+                /* Read the input sample */
+                x0 = * (ptr1++);
+
+                /* Perform the multiply-accumulate */
+                sum0 += x0 * c0;
+
+                /* Read the coefficient */
+                c0 = * (ptr2);
+
+                /* Increment the coefficient pointer by interpolation factor times. */
+                ptr2 += S->L;
+
+                /* Read the input sample */
+                x0 = * (ptr1++);
+
+                /* Perform the multiply-accumulate */
+                sum0 += x0 * c0;
+
+                /* Read the coefficient */
+                c0 = * (ptr2);
+
+                /* Increment the coefficient pointer by interpolation factor times. */
+                ptr2 += S->L;
+
+                /* Read the input sample */
+                x0 = * (ptr1++);
+
+                /* Perform the multiply-accumulate */
+                sum0 += x0 * c0;
+
+                /* Decrement the loop counter */
+                tapCnt--;
+            }
+
+            /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+            tapCnt = phaseLen % 0x4u;
+
+            while (tapCnt > 0u)
+            {
+                /* Perform the multiply-accumulate */
+                sum0 += * (ptr1++) * (*ptr2);
+
+                /* Increment the coefficient pointer by interpolation factor times. */
+                ptr2 += S->L;
+
+                /* Decrement the loop counter */
+                tapCnt--;
+            }
+
+            /* The result is in the accumulator, store in the destination buffer. */
+            *pDst++ = sum0;
+
+            /* Increment the address modifier index of coefficient buffer */
+            j++;
+
+            /* Decrement the loop counter */
+            i--;
+        }
+
+        /* Advance the state pointer by 1
+         * to process the next group of interpolation factor number samples */
+        pState = pState + 1;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Processing is complete.
+     ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+     ** This prepares the state buffer for the next function call. */
+
+    /* Points to the start of the state buffer */
+    pStateCurnt = S->pState;
+
+    tapCnt = (phaseLen - 1u) >> 2u;
+
+    /* copy data */
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+    tapCnt = (phaseLen - 1u) % 0x04u;
+
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+}
+
+/**
+   * @brief Processing function for the floating-point FIR lattice filter.
+   * @param[in]  *S        points to an instance of the floating-point FIR lattice structure.
+   * @param[in]  *pSrc     points to the block of input data.
+   * @param[out] *pDst     points to the block of output data
+   * @param[in]  blockSize number of samples to process.
+   * @return none.
+   */
+
+void ne10_fir_lattice_float_c (
+    const ne10_fir_lattice_instance_f32_t * S,
+    ne10_float32_t * pSrc,
+    ne10_float32_t * pDst,
+    ne10_uint32_t blockSize)
+{
+    ne10_float32_t *pState;                             /* State pointer */
+    ne10_float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+    ne10_float32_t *px;                                 /* temporary state pointer */
+    ne10_float32_t *pk;                                 /* temporary coefficient pointer */
+
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    ne10_float32_t fcurr1, fnext1, gcurr1, gnext1;      /* temporary variables for first sample in loop unrolling */
+    ne10_float32_t fcurr2, fnext2, gnext2;              /* temporary variables for second sample in loop unrolling */
+    ne10_float32_t fcurr3, fnext3, gnext3;              /* temporary variables for third sample in loop unrolling */
+    ne10_float32_t fcurr4, fnext4, gnext4;              /* temporary variables for fourth sample in loop unrolling */
+    ne10_uint32_t numStages = S->numStages;             /* Number of stages in the filter */
+    ne10_uint32_t blkCnt, stageCnt;                     /* temporary variables for counts */
+
+    gcurr1 = 0.0f;
+    pState = &S->pState[0];
+
+    blkCnt = blockSize >> 2;
+
+    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+       a second loop below computes the remaining 1 to 3 samples. */
+    while (blkCnt > 0u)
+    {
+
+        /* Read two samples from input buffer */
+        /* f0(n) = x(n) */
+        fcurr1 = *pSrc++;
+        fcurr2 = *pSrc++;
+
+        /* Initialize coeff pointer */
+        pk = (pCoeffs);
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Read g0(n-1) from state */
+        gcurr1 = *px;
+
+        /* Process first sample for first tap */
+        /* f1(n) = f0(n) +  K1 * g0(n-1) */
+        fnext1 = fcurr1 + ( (*pk) * gcurr1);
+        /* g1(n) = f0(n) * K1  +  g0(n-1) */
+        gnext1 = (fcurr1 * (*pk)) + gcurr1;
+
+        /* Process second sample for first tap */
+        /* for sample 2 processing */
+        fnext2 = fcurr2 + ( (*pk) * fcurr1);
+        gnext2 = (fcurr2 * (*pk)) + fcurr1;
+
+        /* Read next two samples from input buffer */
+        /* f0(n+2) = x(n+2) */
+        fcurr3 = *pSrc++;
+        fcurr4 = *pSrc++;
+
+        /* Copy only last input samples into the state buffer
+           which will be used for next four samples processing */
+        *px++ = fcurr4;
+
+        /* Process third sample for first tap */
+        fnext3 = fcurr3 + ( (*pk) * fcurr2);
+        gnext3 = (fcurr3 * (*pk)) + fcurr2;
+
+        /* Process fourth sample for first tap */
+        fnext4 = fcurr4 + ( (*pk) * fcurr3);
+        gnext4 = (fcurr4 * (*pk++)) + fcurr3;
+
+        /* Update of f values for next coefficient set processing */
+        fcurr1 = fnext1;
+        fcurr2 = fnext2;
+        fcurr3 = fnext3;
+        fcurr4 = fnext4;
+
+        /* Loop unrolling.  Process 4 taps at a time . */
+        stageCnt = (numStages - 1u) >> 2u;
+
+        /* Loop over the number of taps.  Unroll by a factor of 4.
+         ** Repeat until we've computed numStages-3 coefficients. */
+
+        /* Process 2nd, 3rd, 4th and 5th taps ... here */
+        while (stageCnt > 0u)
+        {
+            /* Read g1(n-1), g3(n-1) .... from state */
+            gcurr1 = *px;
+
+            /* save g1(n) in state buffer */
+            *px++ = gnext4;
+
+            /* Process first sample for 2nd, 6th .. tap */
+            /* Sample processing for K2, K6.... */
+            /* f2(n) = f1(n) +  K2 * g1(n-1) */
+            fnext1 = fcurr1 + ( (*pk) * gcurr1);
+            /* Process second sample for 2nd, 6th .. tap */
+            /* for sample 2 processing */
+            fnext2 = fcurr2 + ( (*pk) * gnext1);
+            /* Process third sample for 2nd, 6th .. tap */
+            fnext3 = fcurr3 + ( (*pk) * gnext2);
+            /* Process fourth sample for 2nd, 6th .. tap */
+            fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+            /* g2(n) = f1(n) * K2  +  g1(n-1) */
+            /* Calculation of state values for next stage */
+            gnext4 = (fcurr4 * (*pk)) + gnext3;
+            gnext3 = (fcurr3 * (*pk)) + gnext2;
+            gnext2 = (fcurr2 * (*pk)) + gnext1;
+            gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+
+            /* Read g2(n-1), g4(n-1) .... from state */
+            gcurr1 = *px;
+
+            /* save g2(n) in state buffer */
+            *px++ = gnext4;
+
+            /* Sample processing for K3, K7.... */
+            /* Process first sample for 3rd, 7th .. tap */
+            /* f3(n) = f2(n) +  K3 * g2(n-1) */
+            fcurr1 = fnext1 + ( (*pk) * gcurr1);
+            /* Process second sample for 3rd, 7th .. tap */
+            fcurr2 = fnext2 + ( (*pk) * gnext1);
+            /* Process third sample for 3rd, 7th .. tap */
+            fcurr3 = fnext3 + ( (*pk) * gnext2);
+            /* Process fourth sample for 3rd, 7th .. tap */
+            fcurr4 = fnext4 + ( (*pk) * gnext3);
+
+            /* Calculation of state values for next stage */
+            /* g3(n) = f2(n) * K3  +  g2(n-1) */
+            gnext4 = (fnext4 * (*pk)) + gnext3;
+            gnext3 = (fnext3 * (*pk)) + gnext2;
+            gnext2 = (fnext2 * (*pk)) + gnext1;
+            gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+
+            /* Read g1(n-1), g3(n-1) .... from state */
+            gcurr1 = *px;
+
+            /* save g3(n) in state buffer */
+            *px++ = gnext4;
+
+            /* Sample processing for K4, K8.... */
+            /* Process first sample for 4th, 8th .. tap */
+            /* f4(n) = f3(n) +  K4 * g3(n-1) */
+            fnext1 = fcurr1 + ( (*pk) * gcurr1);
+            /* Process second sample for 4th, 8th .. tap */
+            /* for sample 2 processing */
+            fnext2 = fcurr2 + ( (*pk) * gnext1);
+            /* Process third sample for 4th, 8th .. tap */
+            fnext3 = fcurr3 + ( (*pk) * gnext2);
+            /* Process fourth sample for 4th, 8th .. tap */
+            fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+            /* g4(n) = f3(n) * K4  +  g3(n-1) */
+            /* Calculation of state values for next stage */
+            gnext4 = (fcurr4 * (*pk)) + gnext3;
+            gnext3 = (fcurr3 * (*pk)) + gnext2;
+            gnext2 = (fcurr2 * (*pk)) + gnext1;
+            gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+            /* Read g2(n-1), g4(n-1) .... from state */
+            gcurr1 = *px;
+
+            /* save g4(n) in state buffer */
+            *px++ = gnext4;
+
+            /* Sample processing for K5, K9.... */
+            /* Process first sample for 5th, 9th .. tap */
+            /* f5(n) = f4(n) +  K5 * g4(n-1) */
+            fcurr1 = fnext1 + ( (*pk) * gcurr1);
+            /* Process second sample for 5th, 9th .. tap */
+            fcurr2 = fnext2 + ( (*pk) * gnext1);
+            /* Process third sample for 5th, 9th .. tap */
+            fcurr3 = fnext3 + ( (*pk) * gnext2);
+            /* Process fourth sample for 5th, 9th .. tap */
+            fcurr4 = fnext4 + ( (*pk) * gnext3);
+
+            /* Calculation of state values for next stage */
+            /* g5(n) = f4(n) * K5  +  g4(n-1) */
+            gnext4 = (fnext4 * (*pk)) + gnext3;
+            gnext3 = (fnext3 * (*pk)) + gnext2;
+            gnext2 = (fnext2 * (*pk)) + gnext1;
+            gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+            stageCnt--;
+        }
+
+        /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+        stageCnt = (numStages - 1u) % 0x4u;
+
+        while (stageCnt > 0u)
+        {
+            gcurr1 = *px;
+
+            /* save g value in state buffer */
+            *px++ = gnext4;
+
+            /* Process four samples for last three taps here */
+            fnext1 = fcurr1 + ( (*pk) * gcurr1);
+            fnext2 = fcurr2 + ( (*pk) * gnext1);
+            fnext3 = fcurr3 + ( (*pk) * gnext2);
+            fnext4 = fcurr4 + ( (*pk) * gnext3);
+
+            /* g1(n) = f0(n) * K1  +  g0(n-1) */
+            gnext4 = (fcurr4 * (*pk)) + gnext3;
+            gnext3 = (fcurr3 * (*pk)) + gnext2;
+            gnext2 = (fcurr2 * (*pk)) + gnext1;
+            gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+            /* Update of f values for next coefficient set processing */
+            fcurr1 = fnext1;
+            fcurr2 = fnext2;
+            fcurr3 = fnext3;
+            fcurr4 = fnext4;
+
+            stageCnt--;
+
+        }
+
+        /* The results in the 4 accumulators, store in the destination buffer. */
+        /* y(n) = fN(n) */
+        *pDst++ = fcurr1;
+        *pDst++ = fcurr2;
+        *pDst++ = fcurr3;
+        *pDst++ = fcurr4;
+
+        blkCnt--;
+    }
+
+    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+     ** No loop unrolling is used. */
+    blkCnt = blockSize % 0x4u;
+
+    while (blkCnt > 0u)
+    {
+        /* f0(n) = x(n) */
+        fcurr1 = *pSrc++;
+
+        /* Initialize coeff pointer */
+        pk = (pCoeffs);
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* read g2(n) from state buffer */
+        gcurr1 = *px;
+
+        /* for sample 1 processing */
+        /* f1(n) = f0(n) +  K1 * g0(n-1) */
+        fnext1 = fcurr1 + ( (*pk) * gcurr1);
+        /* g1(n) = f0(n) * K1  +  g0(n-1) */
+        gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+        /* save g1(n) in state buffer */
+        *px++ = fcurr1;
+
+        /* f1(n) is saved in fcurr1
+           for next stage processing */
+        fcurr1 = fnext1;
+
+        stageCnt = (numStages - 1u);
+
+        /* stage loop */
+        while (stageCnt > 0u)
+        {
+            /* read g2(n) from state buffer */
+            gcurr1 = *px;
+
+            /* save g1(n) in state buffer */
+            *px++ = gnext1;
+
+            /* Sample processing for K2, K3.... */
+            /* f2(n) = f1(n) +  K2 * g1(n-1) */
+            fnext1 = fcurr1 + ( (*pk) * gcurr1);
+            /* g2(n) = f1(n) * K2  +  g1(n-1) */
+            gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+            /* f1(n) is saved in fcurr1
+               for next stage processing */
+            fcurr1 = fnext1;
+
+            stageCnt--;
+
+        }
+
+        /* y(n) = fN(n) */
+        *pDst++ = fcurr1;
+
+        blkCnt--;
+
+    }
+
+}
+/**
+   * @brief floating-point Circular write function.
+   */
+
+static void ne10_circular_write_float (
+    ne10_int32_t * circBuffer,
+    ne10_int32_t L,
+    ne10_uint16_t * writeOffset,
+    ne10_int32_t bufferInc,
+    const ne10_int32_t * src,
+    ne10_int32_t srcInc,
+    ne10_uint32_t blockSize)
+{
+    ne10_uint32_t i = 0u;
+    ne10_int32_t wOffset;
+
+    /* Copy the value of Index pointer that points
+     * to the current location where the input samples to be copied */
+    wOffset = *writeOffset;
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0u)
+    {
+        /* copy the input sample to the circular buffer */
+        circBuffer[wOffset] = *src;
+
+        /* Update the input pointer */
+        src += srcInc;
+
+        /* Circularly update wOffset.  Watch out for positive and negative value */
+        wOffset += bufferInc;
+        if (wOffset >= L)
+            wOffset -= L;
+
+        /* Decrement the loop counter */
+        i--;
+    }
+
+    /* Update the index pointer */
+    *writeOffset = wOffset;
+}
+
+
+
+/**
+ * @brief floating-point Circular Read function.
+ */
+static void ne10_circular_read_float (
+    ne10_int32_t * circBuffer,
+    ne10_int32_t L,
+    ne10_int32_t * readOffset,
+    ne10_int32_t bufferInc,
+    ne10_int32_t * dst,
+    ne10_int32_t * dst_base,
+    ne10_int32_t dst_length,
+    ne10_int32_t dstInc,
+    ne10_uint32_t blockSize)
+{
+    ne10_uint32_t i = 0u;
+    ne10_int32_t rOffset, dst_end;
+
+    /* Copy the value of Index pointer that points
+     * to the current location from where the input samples to be read */
+    rOffset = *readOffset;
+    dst_end = (ne10_int32_t) (dst_base + dst_length);
+
+    /* Loop over the blockSize */
+    i = blockSize;
+
+    while (i > 0u)
+    {
+        /* copy the sample from the circular buffer to the destination buffer */
+        *dst = circBuffer[rOffset];
+
+        /* Update the input pointer */
+        dst += dstInc;
+
+        if (dst == (ne10_int32_t *) dst_end)
+        {
+            dst = dst_base;
+        }
+
+        /* Circularly update rOffset.  Watch out for positive and negative value  */
+        rOffset += bufferInc;
+
+        if (rOffset >= L)
+        {
+            rOffset -= L;
+        }
+
+        /* Decrement the loop counter */
+        i--;
+    }
+
+    /* Update the index pointer */
+    *readOffset = rOffset;
+}
+
+
+/**
+ * @brief Processing function for the floating-point sparse FIR filter.
+ * @param[in]  *S          points to an instance of the floating-point sparse FIR structure.
+ * @param[in]  *pSrc       points to the block of input data.
+ * @param[out] *pDst       points to the block of output data
+ * @param[in]  *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in]  blockSize   number of input samples to process per call.
+ * @return none.
+ */
+
+void ne10_fir_sparse_float_c (
+    ne10_fir_sparse_instance_f32_t * S,
+    ne10_float32_t * pSrc,
+    ne10_float32_t * pDst,
+    ne10_float32_t * pScratchIn,
+    ne10_uint32_t blockSize)
+{
+
+    ne10_float32_t *pState = S->pState;                 /* State pointer */
+    ne10_float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+    ne10_float32_t *px;                                 /* Scratch buffer pointer */
+    ne10_float32_t *py = pState;                        /* Temporary pointers for state buffer */
+    ne10_float32_t *pb = pScratchIn;                    /* Temporary pointers for scratch buffer */
+    ne10_float32_t *pOut;                               /* Destination pointer */
+    ne10_int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
+    ne10_uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
+    ne10_uint16_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter  */
+    ne10_int32_t readIndex;                             /* Read index of the state buffer */
+    ne10_uint32_t tapCnt, blkCnt;                       /* loop counters */
+    ne10_float32_t coeff = *pCoeffs++;                  /* Read the first coefficient value */
+
+
+
+    /* BlockSize of Input samples are copied into the state buffer */
+    /* StateIndex points to the starting position to write in the state buffer */
+    ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1,
+                                (ne10_int32_t *) pSrc, 1, blockSize);
+
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+        readIndex += (ne10_int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is updated */
+    py = pState;
+
+    /* blockSize samples are read from the state buffer */
+    ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
+                               (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
+                               blockSize);
+
+    /* Working pointer for the scratch buffer */
+    px = pb;
+
+    /* Working pointer for destination buffer */
+    pOut = pDst;
+
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    /* Loop over the blockSize. Unroll by a factor of 4.
+     * Compute 4 Multiplications at a time. */
+    blkCnt = blockSize >> 2u;
+
+    while (blkCnt > 0u)
+    {
+        /* Perform Multiplications and store in destination buffer */
+        *pOut++ = *px++ * coeff;
+        *pOut++ = *px++ * coeff;
+        *pOut++ = *px++ * coeff;
+        *pOut++ = *px++ * coeff;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* If the blockSize is not a multiple of 4,
+     * compute the remaining samples */
+    blkCnt = blockSize % 0x4u;
+
+    while (blkCnt > 0u)
+    {
+        /* Perform Multiplications and store in destination buffer */
+        *pOut++ = *px++ * coeff;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Load the coefficient value and
+     * increment the coefficient buffer for the next set of state values */
+    coeff = *pCoeffs++;
+
+    /* Read Index, from where the state buffer should be read, is calculated. */
+    readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
+
+    /* Wraparound of readIndex */
+    if (readIndex < 0)
+    {
+        readIndex += (ne10_int32_t) delaySize;
+    }
+
+    /* Loop over the number of taps. */
+    tapCnt = (ne10_uint32_t) numTaps - 1u;
+
+    while (tapCnt > 0u)
+    {
+
+        /* Working pointer for state buffer is updated */
+        py = pState;
+
+        /* blockSize samples are read from the state buffer */
+        ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
+                                   (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
+                                   blockSize);
+
+        /* Working pointer for the scratch buffer */
+        px = pb;
+
+        /* Working pointer for destination buffer */
+        pOut = pDst;
+
+        /* Loop over the blockSize. Unroll by a factor of 4.
+         * Compute 4 MACS at a time. */
+        blkCnt = blockSize >> 2u;
+
+        while (blkCnt > 0u)
+        {
+            /* Perform Multiply-Accumulate */
+            *pOut++ += *px++ * coeff;
+            *pOut++ += *px++ * coeff;
+            *pOut++ += *px++ * coeff;
+            *pOut++ += *px++ * coeff;
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+        /* If the blockSize is not a multiple of 4,
+         * compute the remaining samples */
+        blkCnt = blockSize % 0x4u;
+
+        while (blkCnt > 0u)
+        {
+            /* Perform Multiply-Accumulate */
+            *pOut++ += *px++ * coeff;
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+        /* Load the coefficient value and
+         * increment the coefficient buffer for the next set of state values */
+        coeff = *pCoeffs++;
+
+        /* Read Index, from where the state buffer should be read, is calculated. */
+        readIndex = ( (ne10_int32_t) S->stateIndex -
+                      (ne10_int32_t) blockSize) - *pTapDelay++;
+
+        /* Wraparound of readIndex */
+        if (readIndex < 0)
+        {
+            readIndex += (ne10_int32_t) delaySize;
+        }
+
+        /* Decrement the tap loop counter */
+        tapCnt--;
+    }
+
+}
+
+
+/**
+ * @} end of FIR group
+ */
diff --git a/modules/dsp/NE10_fir.neon.s b/modules/dsp/NE10_fir.neon.s
new file mode 100644
index 0000000..f63cb2e
--- /dev/null
+++ b/modules/dsp/NE10_fir.neon.s
@@ -0,0 +1,1959 @@
+@/*
+@ * Copyright 2012 ARM Limited
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License")@
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at
+@ *
+@ *     http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ */
+
+@/*
+@ * NE10 Library : dsp/NE10_fir.neon.s
+@ */
+
+
+        .text
+        .syntax   unified
+
+        @/**
+        @ * @details
+        @ * This function operates on floating-point data types.
+        @ * There are no restrictions on numTaps and blockSize.
+        @ *
+        @ * The order of the coefficients in *coeffs should be
+        @ * bN, bN-1, bN-2, .....b1, b0
+        @ *
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code>45 + 8 * numTaps + 12.25 * blockSize + 4.375 * numTaps * blockSize</code>
+        @ *
+        @ * @param[in]  *S                points to struct parameter
+        @ * @param[in]  *pSrc             points to the input buffer
+        @ * @param[out]  *pDst            points to the output buffer
+        @ * @param[in]  blockSize         block size of filter
+        @ */
+
+        .align   4
+        .global   ne10_fir_float_neon
+        .extern   ne10_qMaskTable32
+        .thumb
+        .thumb_func
+
+ne10_fir_float_neon:
+                    PUSH    {r4-r12,lr}
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+blockSize        .req   R3
+
+pState           .req   R4             @/* State pointer */
+pCoeffs          .req   R5             @/* Coefficient pointer */
+pStateCurnt      .req   R6             @/* Points to the current sample of the state */
+
+pX               .req   R7             @/* Temporary pointers for state buffer */
+pB               .req   R8             @/* Temporary pointers for coefficient buffer */
+numTaps          .req   R9             @/* Length of the filter */
+
+tapCnt           .req   R10            @ /* Loop counter */
+Count            .req   R11            @ /* Loop counter */
+pTemp            .req   R11
+pMask            .req   R14            @  /* Mask Table */
+
+mask             .req   R12
+
+@/*NEON variale Declaration*/
+qInp             .qn   Q0.F32
+dInp_0           .dn   D0.F32
+dInp_1           .dn   D1.F32
+qCoeff           .qn   Q1.F32
+dCoeff_0         .dn   D2.F32
+dCoeff_1         .dn   D3.F32
+qZero            .qn   Q2.F32
+
+qMask            .qn   Q3.U32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+dOut_0           .dn   D6.F32
+dOut_1           .dn   D7.F32
+
+qAcc0            .qn   Q8.F32
+dAcc0_0          .dn   D16.F32
+dAcc0_1          .dn   D17.F32
+
+
+qTemp            .qn   Q9.F32
+dTemp_0          .dn   D18.F32
+dTemp_1          .dn   D19.F32
+
+qTemp1           .qn   Q10.F32
+dTemp1_0         .dn   D20.F32
+dTemp1_1         .dn   D21.F32
+qTemp2           .qn   Q11.F32
+qTemp3           .qn   Q12.F32
+qMask1           .qn   Q13.U32
+dMask1_0         .dn   D26.U32
+dMask1_1         .dn   D27.U32
+qMaskTmp         .qn   Q14.U32
+dMaskTmp_0       .dn   D28.U32
+dMaskTmp_1       .dn   D29.U32
+
+
+
+
+
+                    LDRH        numTaps,[pStateStruct],#4
+                    LDR         pState,[pStateStruct],#4
+                    LDR         pCoeffs,[pStateStruct],#4
+
+                    @/* S->state buffer contains previous frame (numTaps - 1) samples */
+                    @/* pStateCurnt points to the location where the new input data should be written */
+                    @/*pStateCurnt = &(S->state[(numTaps - 1u)])@*/
+                    SUB         mask,numTaps,#1
+                    LDR         pMask,=ne10_qMaskTable32
+                    AND         tapCnt,numTaps,#3
+                    ADD         pStateCurnt,pState,mask,LSL #2
+                    AND         mask,blockSize,#3
+
+
+                    @/* Apply loop unrolling and compute 4 output values simultaneously.
+                    @* The variables acc0 ... acc3 hold output values that are being computed:
+                    @*
+                    @*    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+                    @*    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+                    @*    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+                    @*    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
+                    @*/
+
+                    @/*If numTaps,blockSize are not  multiples of 4,  Get the appropriate Masks*/
+
+
+                    ADD         pTemp,pMask,tapCnt,LSL #4
+                    VEOR        qZero,qZero
+                    ADD         pX,pMask,mask,LSL #4
+                    VLD1        {dMaskTmp_0,dMaskTmp_1},[pTemp]
+                    VLD1        {dMask1_0,dMask1_1},[pX]
+
+
+                    @/* Copy blockCnt number of  new input samples into the state buffer */
+
+                    SUBS        blockSize,#4
+                    BLT         firEndOuterLoop
+
+                    @/* Compute 4 outputs at a time*/
+
+firOuterLoop:
+
+                    VLD1        {dTemp_0,dTemp_1},[pSrc]!
+                    MOV         pX,pState
+                    MOV         pB,pCoeffs
+                    @/* Read the first four samples from the state buffer:
+                    @* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2],x[n-numTaps-3] */
+
+                    VST1        {dTemp_0,dTemp_1},[pStateCurnt]!
+                    @/* Zero the Accumulators*/
+                    VEOR        qAcc0,qAcc0
+                    VLD1        {dInp_0,dInp_1},[pX]!
+
+                    @//* Read the first four coefficients b[numTaps] to b[numTaps-3] */
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    @/* Loop unrolling.  Process 4 taps at a time. */
+                    SUBS        tapCnt,numTaps,#4
+                    VLD1        {dTemp_0,dTemp_1},[pX]!
+
+                    BLT         firEndInnerLoop
+
+firInnerLoop:
+
+
+                    VEXT        qTemp1,qInp,qTemp,#1
+                    @/* acc0 +=  b[numTaps] * x[n-numTaps-1]+ b[numTaps] * x[n-numTaps-2] +
+                    @* b[numTaps] * x[n-numTaps-3] +  b[numTaps] * x[n-numTaps-4]*/
+                    VMLA        qAcc0,qInp,dCoeff_0[0]
+                    VEXT        qTemp2,qInp,qTemp,#2
+                    @/* acc1 +=  b[numTaps-1] * x[n-numTaps-2]+ b[numTaps-1] * x[n-numTaps-3] +
+                    @b[numTaps-1] * x[n-numTaps-4] +*b[numTaps-1] * x[n-numTaps-5]*/
+                    VMLA        qAcc0,qTemp1,dCoeff_0[1]
+                    VEXT        qTemp3,qInp,qTemp,#3
+                    @/* acc2 +=  b[numTaps-2] * x[n-numTaps-3]+ b[numTaps-2] * x[n-numTaps-4] +
+                    @b[numTaps-2] * x[n-numTaps-5] + *b[numTaps-2] * x[n-numTaps-6]*/
+                    @//vacc0q_f32 = vmlaq_lane_f32(vacc0q_f32,vxtemp2q_f32,vget_high_f32(vcq_f32),0)@
+                    VMLA        qAcc0,qTemp2,dCoeff_1[0]
+                    VMOV        qInp,qTemp
+                    @/* acc3 +=  b[numTaps-3] * x[n-numTaps-4]+ b[numTaps-3] * x[n-numTaps-5] +
+                    @b[numTaps-3] * x[n-numTaps-6] +*b[numTaps-3] * x[n-numTaps-7]  */
+
+                    VMLA        qAcc0,qTemp3,dCoeff_1[1]
+
+                    @/* Read the b[numTaps-4] to b[numTaps-7]  coefficients */
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    SUBS        tapCnt,#4
+                    VLD1        {dTemp_0,dTemp_1},[pX]!
+
+                    BGE         firInnerLoop
+firEndInnerLoop:
+
+
+                    @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+                    @/*Select only the remaining filter Taps*/
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qCoeff,qZero
+                    VEXT        qTemp1,qInp,qTemp,#1
+                    VMLA        qAcc0,qInp,dOut_0[0]
+                    VEXT        qTemp2,qInp,qTemp,#2
+                    VMLA        qAcc0,qTemp1,dOut_0[1]
+                    VEXT        qTemp3,qInp,qTemp,#3
+                    VMLA        qAcc0,qTemp2,dOut_1[0]
+                    @/* Advance the state pointer by 4 to process the next group of 4 samples */
+                    ADD         pState,#16
+
+                    VMLA        qAcc0,qTemp3,dOut_1[1]
+
+
+                    @/* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31
+                    @ * Then store the 4 outputs in the destination buffer. */
+                    SUBS        blockSize,#4
+                    VST1        {dAcc0_0,dAcc0_1},[pDst]!
+
+                    BGE         firOuterLoop
+
+firEndOuterLoop:
+                    @/*Handle BlockSize Not a Multiple of 4*/
+                    ADDS        blockSize,#4
+                    BEQ         firCopyData
+                    @/*Copy the Remaining BlockSize Number of Input Sample to state Buffer*/
+                    VMOV        qMask,qMask1
+                    VLD1        {dTemp1_0,dTemp1_1},[pStateCurnt]
+                    VLD1        {dTemp_0,dTemp_1},[pSrc]
+
+                    ADD         pSrc,pSrc,blockSize,LSL #2
+                    MOV         pX,pState
+                    MOV         pB,pCoeffs
+
+                    VBSL        qMask,qTemp,qTemp1
+                    VST1        {dMask_0,dMask_1},[pStateCurnt]
+                    VLD1        {dInp_0,dInp_1},[pX]!
+
+                    ADD         pStateCurnt,pStateCurnt,blockSize, LSL #2
+
+                    @/* Zero the Accumulators*/
+                    VEOR        qAcc0,qAcc0
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    SUBS        tapCnt,numTaps,#4
+                    VLD1        {dTemp_0,dTemp_1},[pX]!
+
+                    BLT         firEndInnerLoop1
+
+firInnerLoop1:
+
+                    VEXT        qTemp1,qInp,qTemp,#1
+                    VMLA        qAcc0,qInp,dCoeff_0[0]
+                    VEXT        qTemp2,qInp,qTemp,#2
+                    VMLA        qAcc0,qTemp1,dCoeff_0[1]
+                    VEXT        qTemp3,qInp,qTemp,#3
+                    VMLA        qAcc0,qTemp2,dCoeff_1[0]
+                    VMOV        qInp,qTemp
+                    VMLA        qAcc0,qTemp3,dCoeff_1[1]
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    SUBS        tapCnt,#4
+                    VLD1        {dTemp_0,dTemp_1},[pX]!
+
+                    BGE         firInnerLoop1
+firEndInnerLoop1:
+
+
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qCoeff,qZero
+                    VEXT        qTemp1,qInp,qTemp,#1
+                    VMLA        qAcc0,qInp,dOut_0[0]
+                    VEXT        qTemp2,qInp,qTemp,#2
+                    VMLA        qAcc0,qTemp1,dOut_0[1]
+                    VEXT        qTemp3,qInp,qTemp,#3
+                    VMLA        qAcc0,qTemp2,dOut_1[0]
+                    VMOV        qMask,qMask1
+                    VLD1        {dTemp_0,dTemp_1},[pDst]
+                    VMLA        qAcc0,qTemp3,dOut_1[1]
+
+
+                    @/* If the blockSize is not a multiple of 4, Mask the unwanted Output */
+
+                    VBSL        qMask,qAcc0,qTemp
+                    VST1        {dMask_0,dMask_1},[pDst]
+                    ADD         pDst,pDst,blockSize,LSL #2
+                    ADD         pState,pState,blockSize,LSL #2
+
+
+firCopyData:
+                    @/* Processing is complete.  Now shift the data in the state buffer down by
+                    @** blockSize samples.  This prepares the state buffer for the next function
+                    @** call. */
+
+                    @/* Points to the start of the state buffer */
+
+                    SUB         numTaps,numTaps,#1
+                    AND         mask,numTaps,#3
+                    LDR         pStateCurnt,[pStateStruct,#-8]
+                    ADD         pTemp,pMask,mask,LSL #4
+                    VLD1        {dInp_0,dInp_1},[pState]!
+                    VLD1        {dMask_0,dMask_1},[pTemp]
+
+
+                    @/* copy data */
+
+                    SUBS        Count,numTaps,#4
+                    BLT         firEnd
+firCopyLoop:
+                    VST1        {dInp_0,dInp_1},[pStateCurnt]!
+                    SUBS        Count,#4
+                    VLD1        {dInp_0,dInp_1},[pState]!
+                    BGE         firCopyLoop
+
+firEnd:
+
+                    VLD1        {dTemp_0,dTemp_1},[pStateCurnt]
+                    VBSL        qMask,qInp,qTemp
+                    VST1        {dOut_0,dOut_1},[pStateCurnt]
+                    ADD         pStateCurnt,pStateCurnt,mask, LSL #2
+
+                    @/*Return From Function*/
+                    POP     {r4-r12,pc}
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    blockSize
+
+.unreq    pState
+.unreq    pCoeffs
+.unreq    pStateCurnt
+
+.unreq    pX
+.unreq    pB
+.unreq    numTaps
+
+.unreq    tapCnt
+.unreq    Count
+.unreq    pTemp
+.unreq    pMask
+
+.unreq    mask
+
+@/*NEON variale Declaration*/
+.unreq    qInp
+.unreq    dInp_0
+.unreq    dInp_1
+.unreq    qCoeff
+.unreq    dCoeff_0
+.unreq    dCoeff_1
+.unreq    qZero
+
+.unreq    qMask
+.unreq    dMask_0
+.unreq    dMask_1
+.unreq    dOut_0
+.unreq    dOut_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qTemp1
+.unreq    dTemp1_0
+.unreq    dTemp1_1
+.unreq    qTemp2
+.unreq    qTemp3
+.unreq    qMask1
+.unreq    dMask1_0
+.unreq    dMask1_1
+.unreq    qMaskTmp
+.unreq    dMaskTmp_0
+.unreq    dMaskTmp_1
+
+        @/**
+        @ * @details
+        @ * This function operates on floating-point data types.
+        @ * There are no restrictions on numTaps and blockSize.
+        @ *
+        @ * The order of the coefficients in *coeffs should be
+        @ * bN, bN-1, bN-2, .....b1, b0
+        @ *
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code> Co + C1 * numTaps + C3 * blockSize * decimation Factor + c4 * numTaps * blockSize</code>
+        @ *
+        @ * @param[in]  *S                points to struct parameter
+        @ * @param[in]  *pSrc             points to the input buffer
+        @ * @param[out]  *pDst            points to the output buffer
+        @ * @param[in]  blockSize         block size of filter
+        @ */
+
+        .align   4
+        .global   ne10_fir_decimate_float_neon
+        .extern   ne10_qMaskTable32
+        .extern   ne10_divLookUpTable
+        .thumb
+        .thumb_func
+
+ne10_fir_decimate_float_neon:
+
+                            PUSH    {r4-r12,lr}
+
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+blockSize        .req   R3
+
+pState           .req   R4             @/* State pointer */
+pCoeffs          .req   R5             @/* Coefficient pointer */
+decimationFact   .req   R6
+outBlockSize     .req   R7
+
+pX               .req   R6             @/* Temporary pointers for state buffer */
+pB               .req   R8             @/* Temporary pointers for coefficient buffer */
+numTaps          .req   R9             @/* Length of the filter */
+
+tapCnt           .req   R10            @ /* Loop counter */
+Count            .req   R11            @ /* Loop counter */
+pTemp            .req   R11
+blkCnt           .req   R11
+pMask            .req   R14            @  /* Mask Table */
+
+mask             .req   R12
+Offset           .req   R12
+
+@/*NEON variale Declaration*/
+qInp0            .qn   Q0.F32
+dInp0_0          .dn   D0.F32
+dInp0_1          .dn   D1.F32
+
+qCoeff           .qn   Q1.F32
+dCoeff_0         .dn   D2.F32
+dCoeff_1         .dn   D3.F32
+
+qZero            .qn   Q2.F32
+qMask            .qn   Q3.U32
+qMaskF32         .qn   Q3.F32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+dOut_0           .dn   D6.F32
+dOut_1           .dn   D7.F32
+
+qInp3            .qn   Q4.F32
+dInp3_0          .dn   D8.F32
+dInp3_1          .dn   D9.F32
+
+qAcc0            .qn   Q8.F32
+dAcc0_0          .dn   D16.F32
+dAcc0_1          .dn   D17.F32
+
+
+qTemp            .qn   Q9.F32
+dTemp_0          .dn   D18.F32
+dTemp_1          .dn   D19.F32
+
+qInp1            .qn   Q9.F32
+dInp1_0          .dn   D18.F32
+dInp1_1          .dn   D19.F32
+
+qAcc1            .qn   Q10.F32
+dAcc1_0          .dn   D20.F32
+dAcc1_1          .dn   D21.F32
+qAcc2            .qn   Q11.F32
+dAcc2_0          .dn   D22.F32
+dAcc2_1          .dn   D23.F32
+qAcc3            .qn   Q12.F32
+dAcc3_0          .dn   D24.F32
+dAcc3_1          .dn   D25.F32
+
+qMask1           .qn   Q13.U32
+dMask1_0         .dn   D26.U32
+dMask1_1         .dn   D27.U32
+
+qMaskTmp         .qn   Q14.U32
+dMaskTmp_0       .dn   D28.U32
+dMaskTmp_1       .dn   D29.U32
+
+
+qInp2            .qn   Q15.F32
+dInp2_0          .dn   D30.F32
+dInp2_1          .dn   D31.F32
+
+
+
+
+                    LDRB        decimationFact,[pStateStruct],#2
+                    LDRH        numTaps,[pStateStruct],#2
+                    LDR         pCoeffs,[pStateStruct],#4
+                    LDR         pState,[pStateStruct],#4
+
+                    @//outBlockSize = blockSize / S->M
+                    LDR         pTemp,=ne10_divLookUpTable
+                    SUBS        mask,decimationFact,#1
+                    ADD         pTemp,pTemp,mask,LSL #2
+                    LDR         mask,[pTemp]
+                    @//MOV         pX,#0
+
+
+                    SMULWB      outBlockSize,blockSize,mask
+                    CMP         outBlockSize,#0
+                    IT          LT
+                    RSBLT       outBlockSize,#0
+
+
+                    @/* S->state buffer contains previous frame (numTaps - 1) samples */
+                    @/* pStateCurnt points to the location where the new input data should be written */
+                    @//pStateCurnt = S->state + (numTaps - 1u)@
+
+
+                    @/* Copy Blocksize number of new input samples into the state buffer */
+
+                    LDR         pMask,=ne10_qMaskTable32
+                    SUB         tapCnt,numTaps,#1
+                    AND         mask,blockSize,#3
+
+                    ADD         pB,pState,tapCnt,LSL #2
+                    ADD         mask,pMask,mask,LSL #4
+                    VLD1        {dTemp_0,dTemp_1},[pSrc]!
+                    VLD1        {dMask1_0,dMask1_1},[mask]
+
+
+                    SUBS        Count,blockSize,#4
+                    LSL         Offset,decimationFact, #2
+                    VMOV        qMask,qMask1
+
+                    BLT         firDecimateEndCopy
+firDecimateCopyLoop:
+
+                    VST1        {dTemp_0,dTemp_1},[pB]!
+                    SUBS        Count,#4
+                    VLD1        {dTemp_0,dTemp_1},[pSrc]!
+                    BGE         firDecimateCopyLoop
+firDecimateEndCopy:
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]
+
+                    VBSL        qMask,qTemp,qCoeff
+                    VST1        {dMask_0,dMask_1},[pB]
+                    ADD         pB,pB,tapCnt,LSL #2
+
+                    @// Load Mask Value
+                    AND         blkCnt,outBlockSize,#3
+                    ADD         blkCnt,pMask,blkCnt,LSL #4
+                    VLD1        {dMask1_0,dMask1_1},[blkCnt]
+
+                    @/*Load Mask Table Values*/
+
+                    AND         tapCnt,numTaps,#3
+                    ADD         pTemp,pMask,tapCnt,LSL #4
+                    VEOR        qZero,qZero,qZero
+                    VLD1        {dMaskTmp_0,dMaskTmp_1},[pTemp]
+
+                    @/*Handle 4 output samples at a time */
+                    SUBS        blkCnt,outBlockSize,#4
+                    BLT        firDecimateEndOuterLoop
+
+                    @//blkCnt = outBlockSize>>2@
+firDecimateOuterLoop:
+                    @/* Set accumulator to zero */
+                    VEOR        qAcc0,qAcc0,qAcc0
+                    VEOR        qAcc1,qAcc1,qAcc1
+                    VEOR        qAcc2,qAcc2,qAcc2
+                    VEOR        qAcc3,qAcc3,qAcc3
+                    @/* Initialize state pointer */
+                    MOV         pX,pState
+                    @/* Initialize coeff pointer */
+                    MOV         pB,pCoeffs
+
+                    SUBS        tapCnt,numTaps,#4
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+
+                    VLD1        {dInp0_0,dInp0_1},[pX],Offset
+                    VLD1        {dInp1_0,dInp1_1},[pX],Offset
+                    VLD1        {dInp2_0,dInp2_1},[pX],Offset
+                    VLD1        {dInp3_0,dInp3_1},[pX],Offset
+                    SUB         pX,pX,Offset, LSL #2
+                    ADD         pX,pX,#16
+
+                    BLT         firDecimateEndInnerLoop
+firDecimateInnerLoop:
+                    VMLA        qAcc0,qCoeff,qInp0
+                    VMLA        qAcc1,qCoeff,qInp1
+                    VMLA        qAcc2,qCoeff,qInp2
+                    VMLA        qAcc3,qCoeff,qInp3
+
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    VLD1        {dInp0_0,dInp0_1},[pX],Offset
+                    VLD1        {dInp1_0,dInp1_1},[pX],Offset
+                    VLD1        {dInp2_0,dInp2_1},[pX],Offset
+                    VLD1        {dInp3_0,dInp3_1},[pX],Offset
+                    SUB         pX,pX,Offset, LSL #2
+                    ADD         pX,pX,#16
+
+                    SUBS        tapCnt,#4
+                    BGE         firDecimateInnerLoop
+firDecimateEndInnerLoop:
+                    @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qCoeff,qZero
+
+                    VMLA        qAcc0,qMaskF32,qInp0
+                    VMLA        qAcc1,qMaskF32,qInp1
+                    VMLA        qAcc2,qMaskF32,qInp2
+                    VMLA        qAcc3,qMaskF32,qInp3
+
+                    VADD        dAcc0_0,dAcc0_0,dAcc0_1
+                    VADD        dAcc1_0,dAcc1_0,dAcc1_1
+                    VADD        dAcc2_0,dAcc2_0,dAcc2_1
+                    VADD        dAcc3_0,dAcc3_0,dAcc3_1
+
+                    VPADD       dAcc0_0,dAcc0_0,dAcc1_0
+                    VPADD       dAcc0_1,dAcc2_0,dAcc3_0
+                    ADD         pState,pState,Offset,LSL #2
+                    VST1        {dAcc0_0,dAcc0_1},[pDst]!
+
+                    SUBS        blkCnt,#4
+                    BGE         firDecimateOuterLoop
+
+firDecimateEndOuterLoop:
+                    @/*Handle BlockSize Not a Multiple of 4*/
+                    ADDS        blkCnt,#4
+                    BEQ         firDecimateCopyData
+
+
+                    @/* Set accumulator to zero */
+                    VEOR        qAcc0,qAcc0,qAcc0
+                    VEOR        qAcc1,qAcc1,qAcc1
+                    VEOR        qAcc2,qAcc2,qAcc2
+                    VEOR        qAcc3,qAcc3,qAcc3
+                    @/* Initialize state pointer */
+                    MOV         pX,pState
+                    @/* Initialize coeff pointer */
+                    MOV         pB,pCoeffs
+                    SUBS        tapCnt,numTaps,#4
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+
+                    VLD1        {dInp0_0,dInp0_1},[pX],Offset
+                    VLD1        {dInp1_0,dInp1_1},[pX],Offset
+                    VLD1        {dInp2_0,dInp2_1},[pX],Offset
+                    VLD1        {dInp3_0,dInp3_1},[pX],Offset
+                    SUB         pX,pX,Offset, LSL #2
+                    ADD         pX,pX,#16
+
+                    BLT         firDecimateEndInnerLoop1
+firDecimateInnerLoop1:
+                    VMLA        qAcc0,qCoeff,qInp0
+                    VMLA        qAcc1,qCoeff,qInp1
+                    VMLA        qAcc2,qCoeff,qInp2
+                    VMLA        qAcc3,qCoeff,qInp3
+
+                    VLD1        {dCoeff_0,dCoeff_1},[pB]!
+                    VLD1        {dInp0_0,dInp0_1},[pX],Offset
+                    VLD1        {dInp1_0,dInp1_1},[pX],Offset
+                    VLD1        {dInp2_0,dInp2_1},[pX],Offset
+                    VLD1        {dInp3_0,dInp3_1},[pX],Offset
+                    SUB         pX,pX,Offset, LSL #2
+                    ADD         pX,pX,#16
+
+                    SUBS        tapCnt,#4
+                    BGE         firDecimateInnerLoop1
+firDecimateEndInnerLoop1:
+                    @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qCoeff,qZero
+
+                    VMLA        qAcc0,qMaskF32,qInp0
+                    VMLA        qAcc1,qMaskF32,qInp1
+                    VMLA        qAcc2,qMaskF32,qInp2
+                    VMLA        qAcc3,qMaskF32,qInp3
+
+                    VADD        dAcc0_0,dAcc0_0,dAcc0_1
+                    VADD        dAcc1_0,dAcc1_0,dAcc1_1
+                    VADD        dAcc2_0,dAcc2_0,dAcc2_1
+                    VADD        dAcc3_0,dAcc3_0,dAcc3_1
+
+
+                    MUL         Offset,Offset,blkCnt
+                    VPADD       dAcc0_0,dAcc0_0,dAcc1_0
+                    VPADD       dAcc0_1,dAcc2_0,dAcc3_0
+                    ADD         pState,pState,Offset
+
+                    VMOV        qMask,qMask1
+                    VLD1        {dTemp_0,dTemp_1},[pDst]
+                    VBSL        qMask,qAcc0,qTemp
+
+                    VST1        {dMask_0,dMask_1},[pDst]
+                    ADD         pDst,pDst,blkCnt,LSL #2
+
+
+firDecimateCopyData:
+                    @/* Processing is complete.  Now shift the data in the state buffer down by
+                    @** blockSize samples.  This prepares the state buffer for the next function
+                    @** call. */
+
+                    @/* Points to the start of the state buffer */
+
+                    SUB         numTaps,numTaps,#1
+                    AND         mask,numTaps,#3
+                    LDR         pX,[pStateStruct,#-4]
+                    ADD         pTemp,pMask,mask,LSL #4
+                    VLD1        {dInp0_0,dInp0_1},[pState]!
+                    VLD1        {dMask_0,dMask_1},[pTemp]
+
+                    @/* copy data */
+
+                    SUBS        Count,numTaps,#4
+                    BLT         firDecimateEnd
+firDecimateCopyLoop1:
+                    VST1        {dInp0_0,dInp0_1},[pX]!
+                    SUBS        Count,#4
+                    VLD1        {dInp0_0,dInp0_1},[pState]!
+                    BGE         firDecimateCopyLoop1
+firDecimateEnd:
+                    VLD1        {dTemp_0,dTemp_1},[pX]
+                    VBSL        qMask,qInp0,qTemp
+                    VST1        {dOut_0,dOut_1},[pX]
+                    ADD         pX,pX,mask, LSL #2
+
+                    @// Return From Function
+                    POP     {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    blockSize
+
+.unreq    pState
+.unreq    pCoeffs
+.unreq    decimationFact
+.unreq    outBlockSize
+
+.unreq    pX
+.unreq    pB
+.unreq    numTaps
+
+.unreq    tapCnt
+.unreq    Count
+.unreq    pTemp
+.unreq    blkCnt
+.unreq    pMask
+
+.unreq    mask
+.unreq    Offset
+
+@/*NEON variale Declaration*/
+.unreq    qInp0
+.unreq    dInp0_0
+.unreq    dInp0_1
+
+.unreq    qCoeff
+.unreq    dCoeff_0
+.unreq    dCoeff_1
+
+.unreq    qZero
+.unreq    qMask
+.unreq    qMaskF32
+.unreq    dMask_0
+.unreq    dMask_1
+.unreq    dOut_0
+.unreq    dOut_1
+
+.unreq    qInp3
+.unreq    dInp3_0
+.unreq    dInp3_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qInp1
+.unreq    dInp1_0
+.unreq    dInp1_1
+
+.unreq    qAcc1
+.unreq    dAcc1_0
+.unreq    dAcc1_1
+.unreq    qAcc2
+.unreq    dAcc2_0
+.unreq    dAcc2_1
+.unreq    qAcc3
+.unreq    dAcc3_0
+.unreq    dAcc3_1
+
+.unreq    qMask1
+.unreq    dMask1_0
+.unreq    dMask1_1
+
+.unreq    qMaskTmp
+.unreq    dMaskTmp_0
+.unreq    dMaskTmp_1
+
+.unreq    qInp2
+.unreq    dInp2_0
+.unreq    dInp2_1
+
+
+        @/**
+        @ * @details
+        @ * This function operates on floating-point data types.
+        @ * There are no restrictions on numTaps and blockSize.
+        @ *
+        @ * The order of the coefficients in *coeffs should be
+        @ * bN, bN-1, bN-2, .....b1, b0
+        @ *
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code> C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor </code>
+        @ *
+        @ * @param[in]  *S                points to struct parameter
+        @ * @param[in]  *pSrc             points to the input buffer
+        @ * @param[out]  *pDst            points to the output buffer
+        @ * @param[in]  blockSize         block size of filter
+        @ */
+
+        .align   4
+        .global   ne10_fir_interpolate_float_neon
+        .extern   ne10_qMaskTable32
+        .thumb
+        .thumb_func
+
+ne10_fir_interpolate_float_neon:
+                            PUSH    {r4-r12,lr}
+
+
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+blockSize        .req   R3
+
+pState           .req   R4             @/* State pointer */
+
+pB               .req   R5             @/* Temporary pointers for coefficient buffer */
+pCoeffs          .req   R5             @/* Coefficient pointer */
+pStateCurnt      .req   R5             @/* Points to the current sample of the state */
+
+pX               .req   R6             @/* Temporary pointers for state buffer */
+
+interpolationFact .req  R7
+intFact          .req   R8
+phaseLen         .req   R9
+Offset           .req   R10
+
+Count            .req   R11            @ /* Loop counter */
+pTemp            .req   R11
+
+mask             .req   R12
+
+pMask            .req   R14            @  /* Mask Table */
+index            .req   R14
+
+@/*NEON variale Declaration*/
+qInp             .qn   Q0.F32
+dInp_0           .dn   D0.F32
+dInp_1           .dn   D1.F32
+qCoeff0          .qn   Q1.F32
+dCoeff0_0        .dn   D2.F32
+dCoeff0_1        .dn   D3.F32
+qZero            .qn   Q2.F32
+
+qMask            .qn   Q3.U32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+dOut_0           .dn   D6.F32
+dOut_1           .dn   D7.F32
+
+qAcc0            .qn   Q8.F32
+dAcc0_0          .dn   D16.F32
+dAcc0_1          .dn   D17.F32
+
+
+qTemp            .qn   Q9.F32
+dTemp_0          .dn   D18.F32
+dTemp_1          .dn   D19.F32
+
+qCoeff1          .qn   Q10.F32
+dCoeff1_0        .dn   D20.F32
+dCoeff1_1        .dn   D21.F32
+qCoeff2          .qn   Q11.F32
+dCoeff2_0        .dn   D22.F32
+dCoeff2_1        .dn   D23.F32
+qCoeff3          .qn   Q12.F32
+dCoeff3_0        .dn   D24.F32
+dCoeff3_1        .dn   D25.F32
+
+qMask1           .qn   Q13.F32
+dMask1_0         .dn   D26.F32
+dMask1_1         .dn   D27.F32
+
+
+qMaskTemp         .qn   Q14.U32
+dMaskTemp_0       .dn   D28.U32
+dMaskTemp_1       .dn   D29.U32
+
+                    LDRB        interpolationFact,[pStateStruct],#2
+                    LDRH        phaseLen,[pStateStruct],#2
+                    LDR         pCoeffs,[pStateStruct],#4
+                    LDR         pState,[pStateStruct],#4
+
+                    LSL         Offset,interpolationFact, #2
+
+
+
+                    @/* S->state buffer contains previous frame (phaseLen - 1) samples */
+                    @/* pStateCurnt points to the location where the new input data should be written */
+
+
+                    AND         phaseLen,#3
+                    LDR         pMask,=ne10_qMaskTable32
+
+                    @/* Total number of intput samples */
+                    @/*Load Mask Value*/
+
+                    AND         mask,interpolationFact,#3
+                    ADD         pTemp,pMask,phaseLen,LSL #4
+                    ADD         mask,pMask,mask,LSL #4
+
+                    VLD1        {dMaskTemp_0,dMaskTemp_1},[pTemp]
+                    VLD1        {dMask1_0,dMask1_1},[mask]
+
+
+                    VEOR        qZero,qZero,qZero
+
+
+
+                    @/* Loop over the blockSize. */
+                    CMP         blockSize,#0
+                    BEQ         firInterpolateCopyData
+firInterpolateBlockLoop:
+                    @/* Copy new input sample into the state buffer */
+                    LDRH        phaseLen,[pStateStruct,#-10]
+                    LDR         mask,[pSrc],#4
+                    SUB         phaseLen,#1
+                    ADD         pStateCurnt,pState,phaseLen, LSL #2
+
+                    LDRB        interpolationFact,[pStateStruct,#-12]
+                    STR         mask,[pStateCurnt]
+
+
+                    SUBS        intFact,interpolationFact,#4
+                    MOV         index,#4
+
+                    BLT         firInterpolateEndIntplLoop
+firInterpolateInterpolLoop:
+                    VEOR        qAcc0,qAcc0,qAcc0
+                    LDRH        phaseLen,[pStateStruct,#-10]
+                    LDR         pCoeffs,[pStateStruct,#-8]
+                    MOV         pX,pState
+                    SUB         mask,interpolationFact,index
+                    ADD         pB,pCoeffs,mask, LSL #2
+                    @/*Load Coefficients*/
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff0_0,dCoeff0_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff1_0,dCoeff1_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff2_0,dCoeff2_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff3_0,dCoeff3_1},[pB],Offset
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    @/* Loop over the polyPhase length. Unroll by a factor of 4.
+                    @  ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+                    SUBS        phaseLen,#4
+                    BLT         firInterpolateEndPhaseLoop
+firInterpolatePhaseLoop:
+                    @/* Perform the multiply-accumulate */
+                    VMLA        qAcc0,qCoeff0,dInp_0[0]
+                    VMLA        qAcc0,qCoeff1,dInp_0[1]
+                    VMLA        qAcc0,qCoeff2,dInp_1[0]
+                    VMLA        qAcc0,qCoeff3,dInp_1[1]
+
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    @ /*Load Coefficients*/
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff0_0,dCoeff0_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff1_0,dCoeff1_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff2_0,dCoeff2_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff3_0,dCoeff3_1},[pB],Offset
+
+                    SUBS        phaseLen,#4
+                    BGE         firInterpolatePhaseLoop
+firInterpolateEndPhaseLoop:
+                    @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+                    VMOV        qMask,qMaskTemp
+                    VBSL        qMask,qInp,qZero
+                    @/* Perform the multiply-accumulate */
+                    VMLA        qAcc0,qCoeff0,dOut_0[0]
+                    VMLA        qAcc0,qCoeff1,dOut_0[1]
+                    VMLA        qAcc0,qCoeff2,dOut_1[0]
+                    VMLA        qAcc0,qCoeff3,dOut_1[1]
+                    @ /* The result is in the accumulator is in Reverse Order*/
+                    VREV64      qAcc0,qAcc0
+                    @/*Swap the D-Regs of Acc*/
+                    VMOV        dCoeff0_0,dAcc0_1
+                    VMOV        dCoeff0_1,dAcc0_0
+
+                    VST1        {dCoeff0_0,dCoeff0_1},[pDst]!
+                    ADD         index,#4
+                    SUBS        intFact,#4
+                    BGE         firInterpolateInterpolLoop
+
+firInterpolateEndIntplLoop:
+                    ADDS        intFact,#4
+                    BEQ         firInterpolateNextSample
+                    @/*Handle the Remaining Samples*/
+                    VEOR        qAcc0,qAcc0,qAcc0
+                    LDRH        phaseLen,[pStateStruct,#-10]
+                    LDR         pCoeffs,[pStateStruct,#-8]
+                    MOV         pX,pState
+                    SUB         mask,interpolationFact,index
+                    ADD         pB,pCoeffs,mask, LSL #2
+                    @/*Load Coefficients*/
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff0_0,dCoeff0_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff1_0,dCoeff1_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff2_0,dCoeff2_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff3_0,dCoeff3_1},[pB],Offset
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    @/* Loop over the polyPhase length. Unroll by a factor of 4.
+                    @  ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+                    SUBS        phaseLen,#4
+                    BLT         firInterpolateEndPhaseLoop1
+firInterpolatePhaseLoop1:
+                    @/* Perform the multiply-accumulate */
+                    VMLA        qAcc0,qCoeff0,dInp_0[0]
+                    VMLA        qAcc0,qCoeff1,dInp_0[1]
+                    VMLA        qAcc0,qCoeff2,dInp_1[0]
+                    VMLA        qAcc0,qCoeff3,dInp_1[1]
+
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    @ /*Load Coefficients*/
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff0_0,dCoeff0_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff1_0,dCoeff1_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff2_0,dCoeff2_1},[pB],Offset
+                    @/*c0 c1 c2 c3*/
+                    VLD1        {dCoeff3_0,dCoeff3_1},[pB],Offset
+                    SUBS        phaseLen,#4
+                    BGE         firInterpolatePhaseLoop1
+
+firInterpolateEndPhaseLoop1:
+                    @/* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+                    VMOV        qMask,qMaskTemp
+                    VBSL        qMask,qInp,qZero
+                    @/* Perform the multiply-accumulate */
+                    VMLA        qAcc0,qCoeff0,dOut_0[0]
+                    VMLA        qAcc0,qCoeff1,dOut_0[1]
+                    VMLA        qAcc0,qCoeff2,dOut_1[0]
+                    VMLA        qAcc0,qCoeff3,dOut_1[1]
+                    @ /* The result is in the accumulator is in Reverse Order*/
+                    VREV64      qAcc0,qAcc0
+
+                    VMOV        qMask,qMask1
+                    VLD1        {dTemp_0,dTemp_1},[pDst]
+                    @/*Swap the D-Regs of Acc*/
+                    VMOV        dCoeff0_0,dAcc0_1
+                    VMOV        dCoeff0_1,dAcc0_0
+
+                    VBSL        qMask,qCoeff0,qTemp
+                    VST1        {dMask_0,dMask_1},[pDst]
+                    ADD         pDst,pDst,intFact, LSL #2
+
+
+firInterpolateNextSample:
+                    SUBS        blockSize,#1
+                    ADD         pState,#4
+                    BGT         firInterpolateBlockLoop
+
+                    @/*End of Processing*/
+
+firInterpolateCopyData:
+
+                    @/* Save previous phaseLen - 1 samples and get rid of other samples  */
+                    @/* Points to the start of the state buffer */
+                    LDRH        phaseLen,[pStateStruct,#-10]
+                    LDR         pMask,=ne10_qMaskTable32
+                    LDR         pStateCurnt,[pStateStruct,#-4]
+
+                    SUB         phaseLen,phaseLen,#1
+                    AND         mask,phaseLen,#3
+                    ADD         pTemp,pMask,mask,LSL #4
+
+                    VLD1        {dInp_0,dInp_1},[pState]!
+                    VLD1        {dMask_0,dMask_1},[pTemp]
+
+                    @/* copy data */
+
+                    SUBS        Count,phaseLen,#4
+                    BLT         firInterpolateEnd
+firInterpolateCopyLoop:
+                    VST1        {dInp_0,dInp_1},[pStateCurnt]!
+                    SUBS        Count,#4
+                    VLD1        {dInp_0,dInp_1},[pState]!
+                    BGE         firInterpolateCopyLoop
+firInterpolateEnd:
+
+                    VLD1        {dTemp_0,dTemp_1},[pStateCurnt]
+                    VBSL        qMask,qInp,qTemp
+                    VST1        {dOut_0,dOut_1},[pStateCurnt]
+
+                    ADD         pStateCurnt,pStateCurnt,mask, LSL #2
+
+                    @/*Return From Function*/
+                    POP     {r4-r12,pc}
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    blockSize
+
+.unreq    pState
+
+.unreq    pB
+.unreq    pCoeffs
+.unreq    pStateCurnt
+
+.unreq    pX
+
+.unreq    interpolationFact
+.unreq    intFact
+.unreq    phaseLen
+.unreq    Offset
+
+.unreq    Count
+.unreq    pTemp
+
+.unreq    mask
+
+.unreq    pMask
+.unreq    index
+
+@/*NEON variale Declaration*/
+.unreq    qInp
+.unreq    dInp_0
+.unreq    dInp_1
+.unreq    qCoeff0
+.unreq    dCoeff0_0
+.unreq    dCoeff0_1
+.unreq    qZero
+
+.unreq    qMask
+.unreq    dMask_0
+.unreq    dMask_1
+.unreq    dOut_0
+.unreq    dOut_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qCoeff1
+.unreq    dCoeff1_0
+.unreq    dCoeff1_1
+.unreq    qCoeff2
+.unreq    dCoeff2_0
+.unreq    dCoeff2_1
+.unreq    qCoeff3
+.unreq    dCoeff3_0
+.unreq    dCoeff3_1
+
+.unreq    qMask1
+.unreq    dMask1_0
+.unreq    dMask1_1
+
+.unreq    qMaskTemp
+.unreq    dMaskTemp_0
+.unreq    dMaskTemp_1
+
+
+        @/**
+        @ * @details
+        @ * This function operates on floating-point data types.
+        @ * There are no restrictions on numStages and blockSize.
+        @ *
+        @ * The order of the coefficients in *coeffs should be
+        @ * k1, k2, ...kM-1
+        @ *
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code>c0 + c1 * blockSize + c2 * numStages * blockSize</code>
+        @ *
+        @ * @param[in]  *S                points to struct parameter
+        @ * @param[in]  *pSrc             points to the input buffer
+        @ * @param[out]  *pDst            points to the output buffer
+        @ * @param[in]  blockSize         block size of filter
+        @ */
+
+        .align   4
+        .global   ne10_fir_lattice_float_neon
+        .extern   ne10_qMaskTable32
+        .thumb
+        .thumb_func
+
+ne10_fir_lattice_float_neon:
+
+                        PUSH    {r4-r12,lr}
+
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+blockSize        .req   R3
+
+pState           .req   R4             @/* State pointer */
+pCoeffs          .req   R5             @/* Coefficient pointer */
+
+pX               .req   R7             @/* Temporary pointers for state buffer */
+pB               .req   R8             @/* Temporary pointers for coefficient buffer */
+numStages        .req   R9             @/* Length of the filter */
+
+stageCnt         .req   R10            @ /* Loop counter */
+
+
+pTemp            .req   R11
+pMask            .req   R14            @  /* Mask Table */
+mask             .req   R12
+
+@/*NEON variale Declaration*/
+qFcurr           .qn   Q0.F32
+dFcurr_0         .dn   D0.F32
+dFcurr_1         .dn   D1.F32
+qCoeff           .qn   Q1.F32
+dCoeff_0         .dn   D2.F32
+dCoeff_1         .dn   D3.F32
+
+qZero            .qn   Q2.F32
+
+qMask            .qn   Q3.U32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+dOut_0           .dn   D6.F32
+dOut_1           .dn   D7.F32
+
+qAcc0            .qn   Q8.F32
+dAcc0_0          .dn   D16.F32
+dAcc0_1          .dn   D17.F32
+
+qTemp            .qn   Q9.F32
+dTemp_0          .dn   D18.F32
+dTemp_1          .dn   D19.F32
+
+qFnext           .qn   Q10.F32
+dFnext_0         .dn   D20.F32
+dFnext_1         .dn   D21.F32
+qGcurr           .qn   Q11.F32
+dGcurr_0         .dn   D22.F32
+dGcurr_1         .dn   D23.F32
+qGnext           .qn   Q12.F32
+dGnext_0         .dn   D24.F32
+dGnext_1         .dn   D25.F32
+
+qMask1           .qn   Q13.U32
+dMask1_0         .dn   D26.U32
+dMask1_1         .dn   D27.U32
+qMaskTmp         .qn   Q14.U32
+dMaskTmp_0       .dn   D28.U32
+dMaskTmp_1       .dn   D29.U32
+qTemp1           .qn   Q15.F32
+dTemp1_0         .dn   D30.F32
+dTemp1_1         .dn   D31.F32
+
+fNext            .dn   D0.F32
+gCurr            .dn   D1.F32
+gNext            .dn   D2.F32
+fCurr            .dn   D3.F32
+Coeff            .dn   D4.F32
+
+                            @/* Length of the filter */
+                            LDRH        numStages,[pStateStruct],#4
+                            @/* State pointer */
+                            LDR         pState,[pStateStruct],#4
+                            @/* Coefficient pointer */
+                            LDR         pCoeffs,[pStateStruct],#4
+
+
+                            @// Get the Mask Values
+
+                            LDR         pMask,=ne10_qMaskTable32
+                            SUB         numStages,#1
+                            AND         mask,numStages, #3
+                            AND         stageCnt,blockSize,#3
+
+                            ADD         pTemp,pMask,mask,LSL #4
+                            ADD         stageCnt,pMask,stageCnt,LSL #4
+                            VLD1        {dMaskTmp_0,dMaskTmp_1},[pTemp]
+                            VLD1        {dMask1_0,dMask1_1},[stageCnt]
+                            VEOR        qZero,qZero,qZero
+
+
+                            SUBS        blockSize,#4
+                            BLT         firLatticeEndOuterLoop
+firLatticeOuterLoop:
+                            @/* Initialize coeff pointer */
+                            MOV         pB,pCoeffs
+                            @/* Initialize state pointer */
+                            MOV         pX,pState
+                            @/* Read Four samples from input buffer: fcurr0, fcurr1,fcurr2,fcurr3*/
+                            @/* f0(n) = x(n) */
+                            VLD1        {dFcurr_0,dFcurr_1},[pSrc]!
+                            @/*Read one Sample from the State Buffer*/
+                            VLD1        {dGcurr_1[1]},[pX]
+                            VEXT        qGnext,qGcurr,qFcurr,#3
+
+                            VLD1        {dCoeff_0[],dCoeff_1[]},[pB]!
+                            VMOV        qFnext,qFcurr
+                            VST1        {dFcurr_1[1]},[pX]!
+                            @/* fi(n) = fi-1(n) + Ki * gi-1(n-1) */
+                            @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */
+                            @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/
+
+                            VMLA        qFcurr,qGnext,qCoeff
+                            @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/
+                            VMLA        qGnext,qFnext,qCoeff
+
+
+                            @/* Loop unrolling.  Process 4 taps at a time . */
+                            SUBS        stageCnt,numStages,#4
+                            BLT         firLatticeEndInnerLoop
+                            @/* Loop over the number of taps.  Unroll by a factor of 4.
+                            @  * Repeat until we've computed numStages-3 coefficients. */
+                            @/* Process 2nd, 3rd, 4th and 5th taps ... here */
+firLatticeInnerLoop:
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VREV64      dTemp_0,dGnext_1
+
+                            VLD1        {dCoeff_0[],dCoeff_1[]},[pB]!
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            @ /* fi(n) = fi-1(n) + Ki * gi-1(n-1) */
+                            @/* gi(n) = fi-1(n) * Ki + gi-1(n-1) */
+                            @/* ki*gcurr4+fcurr4 ki*gcurr3+fcurr3 ki*gcurr2+fcurr2 ki*gcurr1+fcurr1*/
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            @/* ki*fcurr4+gcurr4 ki*fcurr3+gcurr3 ki*fcurr2+gcurr2 ki*fcurr1+gcurr1*/
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+                            @/*Prepare for Next Stage*/
+                            VLD1        {dGcurr_1[1]},[pX]!
+
+                            VLD1        {dCoeff_0[],dCoeff_1[]},[pB]!
+                            VEXT        dTemp_0,dGnext_1,dTemp_0,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            @/*Next Stage*/
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+                            @/*Prepare for Next Stage*/
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VLD1        {dCoeff_0[],dCoeff_1[]},[pB]!
+                            VEXT        dTemp_1,dGnext_1,dTemp_1,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+
+                            @/*Next Stage*/
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+
+                            @/*Prepare for Next Stage*/
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VLD1        {dCoeff_0[],dCoeff_1[]},[pB]!
+                            VEXT        dTemp_1,dGnext_1,dTemp_1,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            VREV64      qTemp,qTemp
+                            @/*Next Stage*/
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qFcurr,qGcurr,qCoeff
+                            VMLA        qGnext,qFnext,qCoeff
+                            SUB         pX,#16
+
+                            @/*Store the samples in the state buffer for next frame*/
+                            VST1        {dTemp_0,dTemp_1},[pX]!
+                            SUBS        stageCnt,#4
+                            BGE         firLatticeInnerLoop
+firLatticeEndInnerLoop:
+                            ADDS        stageCnt,#4
+                            BEQ         firLatticeFinishInner
+                            VMOV        qMask,qMaskTmp
+                            VLD1        {dCoeff_0,dCoeff_1},[pB]!
+
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VREV64      dTemp_0,dGnext_1
+                            VBSL        qMask,qCoeff,qZero
+
+
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+                            VDUP        qCoeff,dMask_0[0]
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+                            VLD1        {dGcurr_1[1]},[pX]!
+
+                            VDUP        qCoeff,dMask_0[1]
+                            VEXT        dTemp_0,dGnext_1,dTemp_0,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VDUP        qCoeff,dMask_1[0]
+                            VEXT        dTemp_1,dGnext_1,dTemp_1,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff
+                            VMLA        qFcurr,qGcurr,qCoeff
+
+                            VLD1        {dGcurr_1[1]},[pX]!
+                            VDUP        qCoeff,dMask_1[1]
+                            VEXT        dTemp_1,dGnext_1,dTemp_1,#1
+                            VEXT        qGcurr,qGcurr,qGnext,#3
+
+                            VREV64      qTemp,qTemp
+
+                            VMOV        qFnext,qFcurr
+                            VMOV        qGnext,qGcurr
+                            SUB         pX,pX,#16
+
+                            VMOV        qMask,qMaskTmp
+                            VMLA        qFcurr,qGcurr,qCoeff
+                            VLD1        {dTemp1_0,dTemp1_1},[pX]
+                            VMLA        qGnext,qFnext,qCoeff
+                            VBSL        qMask,qTemp,qTemp1
+                            VST1        {dMask_0,dMask_1},[pX]
+                            ADD         pX,pX,stageCnt, LSL #2
+
+firLatticeFinishInner:
+
+                            VST1        {dFcurr_0,dFcurr_1},[pDst]!
+                            SUBS        blockSize,#4
+                            BGE         firLatticeOuterLoop
+
+firLatticeEndOuterLoop:
+                            ADDS        blockSize,#4
+                            BEQ         firLatticeEnd
+
+firLatticeOuterLoop1:
+                            VLD1        {fCurr[0]},[pSrc]!
+                            MOV         pB,pCoeffs
+                            MOV         pX,pState
+                            VLD1        {gCurr[0]},[pX]
+                            VLD1        {Coeff[0]},[pB]!
+
+                            VST1        {fCurr[0]},[pX]!
+                            VMOV        gNext,gCurr
+                            VMLA        gNext,Coeff,fCurr
+                            VMLA        fCurr,Coeff,gCurr
+
+                            SUBS        stageCnt,numStages,#1
+                            BLE         firLatticeEndinnerLoop1
+firLatticeInnerLoop1:
+                            VLD1        {gCurr[0]},[pX]
+                            VST1        {gNext[0]},[pX]!
+
+                            VLD1        {Coeff[0]},[pB]!
+
+                            VMOV        gNext,gCurr
+                            VMLA        gNext,Coeff,fCurr
+                            VMLA        fCurr,Coeff,gCurr
+                            SUBS        stageCnt,#1
+                            BGE         firLatticeInnerLoop1
+firLatticeEndinnerLoop1:
+                            VST1        {fCurr[0]},[pDst]!
+                            SUBS        blockSize,#1
+                            BGT         firLatticeOuterLoop1
+
+firLatticeEnd:
+                             @/*Return From Function*/
+                            POP     {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    blockSize
+
+.unreq    pState
+.unreq    pCoeffs
+
+.unreq    pX
+.unreq    pB
+.unreq    numStages
+
+.unreq    stageCnt
+
+.unreq    pTemp
+.unreq    pMask
+.unreq    mask
+
+.unreq    fNext
+.unreq    gCurr
+.unreq    gNext
+.unreq    fCurr
+.unreq    Coeff
+
+@/*NEON variale Declaration*/
+.unreq    qFcurr
+.unreq    dFcurr_0
+.unreq    dFcurr_1
+.unreq    qCoeff
+.unreq    dCoeff_0
+.unreq    dCoeff_1
+
+.unreq    qZero
+
+.unreq    qMask
+.unreq    dMask_0
+.unreq    dMask_1
+.unreq    dOut_0
+.unreq    dOut_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qFnext
+.unreq    dFnext_0
+.unreq    dFnext_1
+.unreq    qGcurr
+.unreq    dGcurr_0
+.unreq    dGcurr_1
+.unreq    qGnext
+.unreq    dGnext_0
+.unreq    dGnext_1
+
+.unreq    qMask1
+.unreq    dMask1_0
+.unreq    dMask1_1
+.unreq    qMaskTmp
+.unreq    dMaskTmp_0
+.unreq    dMaskTmp_1
+.unreq    qTemp1
+.unreq    dTemp1_0
+.unreq    dTemp1_1
+
+        @/**
+        @ * @details
+        @ * This function operates on floating-point data types.
+        @ * There are no restrictions on numTaps and blockSize.
+        @ *
+        @ * The scratch buffer, pScratch is internally used for holding the state values temporarily.
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code> C0 * blockSize + C1 * numTaps + C2 * numTaps * blockSize</code>
+        @ *
+        @ * <b>Cycle Count:</b>
+        @ *
+        @ * <code> C0 + C2 * blockSize + C3 * blockSize * interpolateFactor + C4 * numTaps * blockSize * interpolateFactor </code>
+        @ *
+        @ * @param[in]  *S                points to struct parameter
+        @ * @param[in]  *pSrc             points to the input buffer
+        @ * @param[out]  *pDst            points to the output buffer
+        @ * @param[out]  *pScratch        points to the scratch buffer
+        @ * @param[in]  blockSize         block size of filter
+        @ */
+
+        .align   4
+        .global   ne10_fir_sparse_float_neon
+        .extern   ne10_qMaskTable32
+        .thumb
+        .thumb_func
+
+ne10_fir_sparse_float_neon:
+                            PUSH    {r4-r12,lr}
+                            PUSH    {r0}
+
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+pScratch         .req   R3
+blockSize        .req   R4
+size2            .req   R4
+
+pYtmp1           .req   R0
+pOut             .req   R0
+Offset           .req   R0
+
+readIndex        .req   R1
+
+numTaps          .req   R5             @/* Length of the filter */
+
+pState           .req   R6             @/* State pointer */
+pCoeffs          .req   R7             @/* Coefficient pointer */
+stateIndex       .req   R8
+
+maxDelay         .req   R9
+delaySize        .req   R9
+
+pTapDelay        .req   R10
+
+blkCnt           .req   R11
+size1            .req   R11
+temp             .req   R1
+mask             .req   R11
+pMask            .req   R11
+
+pX               .req   R12
+
+pY               .req   R14
+pYtmp2           .req   R14
+
+
+@/*NEON variale Declaration*/
+qInp             .qn   Q0.F32
+dInp_0           .dn   D0.F32
+dInp_1           .dn   D1.F32
+
+qCoeff           .qn   Q1.F32
+dCoeff_0         .dn   D2.F32
+dCoeff_1         .dn   D3.F32
+
+qZero            .qn   Q2.F32
+
+qMask            .qn   Q3.U32
+qMaskF32         .qn   Q3.F32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+
+
+qAcc0            .qn   Q4.F32
+dAcc0_0          .dn   D8.F32
+dAcc0_1          .dn   D9.F32
+
+
+qTemp            .qn   Q8.F32
+dTemp_0          .dn   D16.F32
+dTemp_1          .dn   D17.F32
+
+
+qMaskTmp         .qn   Q9.U32
+dMaskTmp_0       .dn   D18.U32
+dMaskTmp_1       .dn   D19.U32
+
+
+                    /*Load Mask Table*/
+
+                    LDRH        numTaps,[pStateStruct],#2
+                    LDRH        stateIndex,[pStateStruct],#2
+                    LDR         pState,[pStateStruct],#4
+                    LDR         pCoeffs,[pStateStruct],#4
+                    LDRH        maxDelay,[pStateStruct],#4
+                    LDR         pTapDelay,[pStateStruct],#4
+
+                    @// Load blockSize from Stack
+                    LDR         blockSize,[SP,#44]
+                    LDR         pMask,=ne10_qMaskTable32
+                    ADD         delaySize,blockSize,maxDelay
+
+                    VEOR        qZero,qZero
+                    AND         pY,blockSize,#3
+                    ADD         pY,pMask,pY,LSL #4
+                    VLD1        {dMaskTmp_0,dMaskTmp_1},[pY]
+
+
+                    @/* BlockSize of Input samples are copied into the state buffer */
+                    @/* StateIndex points to the starting position to write in the state buffer */
+                    MOV         pX,pState
+                    LSL         Offset,stateIndex,#2
+
+                    SUBS        blkCnt,blockSize,#1
+                    BLT         firSparseEndSrcCopy
+firSparseSrcCopyLoop:
+
+                    LDR         pY,[pSrc],#4
+                    STR         pY,[pX,Offset]
+                    ADD         Offset,#4
+                    CMP         delaySize,Offset,LSR #2
+                    IT          LE
+                    SUBLE       Offset,Offset,delaySize, LSL #2
+                    SUBS        blkCnt,#1
+                    BGE         firSparseSrcCopyLoop
+firSparseEndSrcCopy:
+
+                    LSR         stateIndex,Offset,#2
+                    LDR         Offset,[SP,#0]
+                    STRH        stateIndex,[Offset,#2]
+
+                    LDR         readIndex,[pTapDelay],#4
+                    ADD         readIndex,readIndex,blockSize
+                    SUBS        readIndex,stateIndex,readIndex
+
+                    @/*Wrap arround index*/
+                    IT          LT
+                    ADDLT       readIndex,readIndex,delaySize
+
+
+                    @/*Processing begins*/
+                    @/*First stage*/
+                    MOV         pY,pState
+                    MOV         pX,pScratch
+
+                    @/* copy the sample from the circular buffer to the destination buffer */
+                    SUB         size1,delaySize,readIndex
+                    CMP         size1,blockSize
+                    IT          GT
+                    MOVGT       size1,blockSize
+
+                    ADD         pYtmp1,pY,readIndex, LSL #2
+                    SUB         size2,blockSize,size1
+                    MOV         pYtmp2,pY
+
+                    CMP         size1,#0
+                    BLE         firSparseEndcopy1
+firSparseCopy1:
+                    LDR         temp,[pYtmp1],#4
+                    SUBS        size1,#1
+                    STR         temp,[pScratch],#4
+                    BGT         firSparseCopy1
+firSparseEndcopy1:
+
+                    CMP         size2,#0
+                    BLE         firSparseEndcopy2
+firSparseCopy2:
+                    LDR         temp,[pYtmp2],#4
+                    SUBS        size2,#1
+                    STR         temp,[pScratch],#4
+                    BGT         firSparseCopy2
+firSparseEndcopy2:
+
+
+                    @// Load blockSize from Stack
+                    LDR         blockSize,[SP,#44]
+
+
+                    MOV         pOut,pDst
+                    VLD1         {dCoeff_0[],dCoeff_1[]},[pCoeffs]!
+                    @//CMP         tapCnt,numTaps
+
+                    @//Complete the case of tapCnt=numTaps
+                    SUBS        blkCnt,blockSize,#4
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    BLT         firSparseEndInnerLoop
+firSparseInnerLoop:
+                    VMUL        qAcc0,qInp,qCoeff
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    SUBS        blkCnt,#4
+                    VST1        {dAcc0_0,dAcc0_1},[pOut]!
+                    BGE         firSparseInnerLoop
+
+firSparseEndInnerLoop:
+                    ADDS        blkCnt,#4
+                    @/* If the blockSize is not a multiple of 4,
+                    @* * compute the remaining samples */
+
+                    VLD1        {dTemp_0,dTemp_1},[pOut]
+                    VMUL        qAcc0,qInp,qCoeff
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qAcc0,qTemp
+                    VST1        {dMask_0,dMask_1},[pOut]
+                    ADD         pOut,pOut,blkCnt,LSL #2
+
+                    LDR         readIndex,[pTapDelay],#4
+                    ADD         readIndex,readIndex,blockSize
+                    SUBS        readIndex,stateIndex,readIndex
+
+                    @/*Wrap arround index*/
+                    IT          LT
+                    ADDLT       readIndex,readIndex,delaySize
+
+                    SUBS        numTaps,#1
+                    BLE         firSparseEnd
+firSparseOuterLoop:
+
+                    @// Load blockSize from Stack
+                    LDR         blockSize,[SP,#44]
+
+                    MOV         pY,pState
+                    MOV         pX,pScratch
+
+                    @/* copy the sample from the circular buffer to the destination buffer */
+                    SUB         size1,delaySize,readIndex
+                    CMP         size1,blockSize
+                    IT          GT
+                    MOVGT       size1,blockSize
+
+                    ADD         pYtmp1,pY,readIndex, LSL #2
+                    SUB         size2,blockSize,size1
+                    MOV         pYtmp2,pY
+
+
+                    CMP         size1,#0
+                    BLE         firSparseEndcopy3
+firSparseCopy3:
+                    LDR         temp,[pYtmp1],#4
+                    SUBS        size1,#1
+                    STR         temp,[pScratch],#4
+                    BGT         firSparseCopy3
+firSparseEndcopy3:
+                    CMP         size2,#0
+                    BLE         firSparseEndcopy4
+firSparseCopy4:
+                    LDR         temp,[pYtmp2],#4
+                    SUBS        size2,#1
+                    STR         temp,[pScratch],#4
+                    BGT         firSparseCopy4
+firSparseEndcopy4:
+
+                    @// Load blockSize from Stack
+                    LDR         blockSize,[SP,#44]
+
+
+                    MOV         pOut,pDst
+                    VLD1         {dCoeff_0[],dCoeff_1[]},[pCoeffs]!
+
+
+                    @//Complete the case of tapCnt=numTaps
+                    SUBS        blkCnt,blockSize,#4
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    VLD1        {dAcc0_0,dAcc0_1},[pOut]
+                    BLT         firSparseEndInnerLoop1
+firSparseInnerLoop1:
+                    VMLA        qAcc0,qInp,qCoeff
+                    VLD1        {dInp_0,dInp_1},[pX]!
+                    SUBS        blkCnt,#4
+                    VST1        {dAcc0_0,dAcc0_1},[pOut]!
+                    VLD1        {dAcc0_0,dAcc0_1},[pOut]
+                    BGE         firSparseInnerLoop1
+
+firSparseEndInnerLoop1:
+                    ADDS        blkCnt,#4
+                    @/* If the blockSize is not a multiple of 4,
+                    @* * compute the remaining samples */
+
+
+                    VMOV        qMask,qMaskTmp
+                    VBSL        qMask,qInp,qZero
+                    VMLA        qAcc0,qMaskF32,qCoeff
+
+                    VST1        {dAcc0_0,dAcc0_1},[pOut]
+                    ADD         pOut,pOut,blkCnt,LSL #2
+
+                    LDR         readIndex,[pTapDelay],#4
+                    ADD         readIndex,readIndex,blockSize
+                    SUBS        readIndex,stateIndex,readIndex
+
+                    @/*Wrap arround index*/
+                    IT          LT
+                    ADDLT       readIndex,readIndex,delaySize
+
+                    SUBS        numTaps,#1
+
+                    BGT         firSparseOuterLoop
+firSparseEnd:
+                    @// Return From Function
+                    POP     {r0}
+                    POP     {r4-r12,pc}
+
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    pScratch
+.unreq    blockSize
+.unreq    size2
+
+.unreq    pYtmp1
+.unreq    pOut
+.unreq    Offset
+
+.unreq    readIndex
+
+.unreq    numTaps
+
+.unreq    pState
+.unreq    pCoeffs
+.unreq    stateIndex
+
+.unreq    maxDelay
+.unreq    delaySize
+
+.unreq    pTapDelay
+
+.unreq    blkCnt
+.unreq    size1
+.unreq    temp
+.unreq    mask
+.unreq    pMask
+
+.unreq    pX
+
+.unreq    pY
+.unreq    pYtmp2
+
+@/*NEON variale Declaration*/
+.unreq    qInp
+.unreq    dInp_0
+.unreq    dInp_1
+
+.unreq    qCoeff
+.unreq    dCoeff_0
+.unreq    dCoeff_1
+
+.unreq    qZero
+
+.unreq    qMask
+.unreq    qMaskF32
+.unreq    dMask_0
+.unreq    dMask_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qMaskTmp
+.unreq    dMaskTmp_0
+.unreq    dMaskTmp_1
+
+        .end
diff --git a/modules/dsp/NE10_fir_init.c b/modules/dsp/NE10_fir_init.c
new file mode 100644
index 0000000..de9bce4
--- /dev/null
+++ b/modules/dsp/NE10_fir_init.c
@@ -0,0 +1,276 @@
+/*
+ *  Copyright 2012 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <string.h>
+
+#include "NE10_types.h"
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the floating-point FIR filter structure.
+ * @param[in]     numTaps  Number of filter coefficients in the filter.
+ * @param[in]     *pCoeffs points to the filter coefficients buffer.
+ * @param[in]     *pState points to the state buffer.
+ * @param[in]     blockSize number of samples that are processed per call.
+ * @return        none.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
+ */
+
+ne10_result_t ne10_fir_init_float (ne10_fir_instance_f32_t * S,
+                                   ne10_uint16_t numTaps,
+                                   ne10_float32_t * pCoeffs,
+                                   ne10_float32_t * pState,
+                                   ne10_uint32_t blockSize)
+{
+    /* Assign filter taps */
+    S->numTaps = numTaps;
+
+    /* Assign coefficient pointer */
+    S->pCoeffs = pCoeffs;
+
+    /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */
+    memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t));
+
+    /* Assign state pointer */
+    S->pState = pState;
+    return NE10_OK;
+}
+
+/**
+ * @brief  Initialization function for the floating-point FIR decimator.
+ * @param[in,out] *S points to an instance of the floating-point FIR decimator structure.
+ * @param[in] numTaps  number of coefficients in the filter.
+ * @param[in] M  decimation factor.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return    The function returns NE10_OK if initialization was successful or NE10_ERR if
+ * <code>blockSize</code> is not a multiple of <code>M</code>.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>numTaps+blockSize-1</code> words where <code>blockSize</code> is the number of input samples passed to <code>arm_fir_decimate_f32()</code>.
+ * <code>M</code> is the decimation factor.
+ */
+
+ne10_result_t ne10_fir_decimate_init_float (
+    ne10_fir_decimate_instance_f32_t * S,
+    ne10_uint16_t numTaps,
+    ne10_uint8_t M,
+    ne10_float32_t * pCoeffs,
+    ne10_float32_t * pState,
+    ne10_uint32_t blockSize)
+{
+    ne10_result_t status;
+
+    /* The size of the input block must be a multiple of the decimation factor */
+    if ( (blockSize % M) != 0u)
+    {
+        /* Set status as NE10_ERR */
+        status = NE10_ERR;
+    }
+    else
+    {
+        /* Assign filter taps */
+        S->numTaps = numTaps;
+
+        /* Assign coefficient pointer */
+        S->pCoeffs = pCoeffs;
+
+        /* Clear state buffer and size is always (blockSize + numTaps - 1) */
+        memset (pState, 0, (numTaps + (blockSize - 1u)) * sizeof (ne10_float32_t));
+
+        /* Assign state pointer */
+        S->pState = pState;
+
+        /* Assign Decimation Factor */
+        S->M = M;
+
+        status = NE10_OK;
+    }
+
+    return (status);
+
+}
+
+/**
+ * @brief  Initialization function for the floating-point FIR interpolator.
+ * @param[in,out] *S        points to an instance of the floating-point FIR interpolator structure.
+ * @param[in]     L         upsample factor.
+ * @param[in]     numTaps   number of filter coefficients in the filter.
+ * @param[in]     *pCoeffs  points to the filter coefficient buffer.
+ * @param[in]     *pState   points to the state buffer.
+ * @param[in]     blockSize number of input samples to process per call.
+ * @return        The function returns NE10_OK if initialization was successful or NE10_ERR if
+ * the filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>.
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> points to the array of filter coefficients stored in time reversed order:
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-2], ..., b[1], b[0]}
+ * </pre>
+ * The length of the filter <code>numTaps</code> must be a multiple of the interpolation factor <code>L</code>.
+ * \par
+ * <code>pState</code> points to the array of state variables.
+ * <code>pState</code> is of length <code>(numTaps/L)+blockSize-1</code> words
+ * where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_interpolate_f32()</code>.
+ */
+
+ne10_result_t ne10_fir_interpolate_init_float (
+    ne10_fir_interpolate_instance_f32_t * S,
+    ne10_uint8_t L,
+    ne10_uint16_t numTaps,
+    ne10_float32_t * pCoeffs,
+    ne10_float32_t * pState,
+    ne10_uint32_t blockSize)
+{
+    ne10_result_t status;
+
+    /* The filter length must be a multiple of the interpolation factor */
+    if ( (numTaps % L) != 0u)
+    {
+        /* Set status as NE10_ERR */
+        status = NE10_ERR;
+    }
+    else
+    {
+
+        /* Assign coefficient pointer */
+        S->pCoeffs = pCoeffs;
+
+        /* Assign Interpolation factor */
+        S->L = L;
+
+        /* Assign polyPhaseLength */
+        S->phaseLength = numTaps / L;
+
+        /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */
+        memset (pState, 0,
+                (blockSize +
+                 ( (ne10_uint32_t) S->phaseLength - 1u)) * sizeof (ne10_float32_t));
+
+        /* Assign state pointer */
+        S->pState = pState;
+
+        status = NE10_OK;
+    }
+
+    return (status);
+
+}
+
+/**
+ * @brief Initialization function for the floating-point FIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point FIR lattice structure.
+ * @param[in] numStages  number of filter stages.
+ * @param[in] *pCoeffs points to the coefficient buffer.  The array is of length numStages.
+ * @param[in] *pState points to the state buffer.  The array is of length numStages.
+ * @return none.
+ */
+
+ne10_result_t ne10_fir_lattice_init_float (
+    ne10_fir_lattice_instance_f32_t * S,
+    ne10_uint16_t numStages,
+    ne10_float32_t * pCoeffs,
+    ne10_float32_t * pState)
+{
+    /* Assign filter taps */
+    S->numStages = numStages;
+
+    /* Assign coefficient pointer */
+    S->pCoeffs = pCoeffs;
+
+    /* Clear state buffer and size is always numStages */
+    memset (pState, 0, (numStages) * sizeof (ne10_float32_t));
+
+    /* Assign state pointer */
+    S->pState = pState;
+
+    return NE10_OK;
+}
+
+/**
+ * @brief  Initialization function for the floating-point sparse FIR filter.
+ * @param[in,out] *S         points to an instance of the floating-point sparse FIR structure.
+ * @param[in]     numTaps    number of nonzero coefficients in the filter.
+ * @param[in]     *pCoeffs   points to the array of filter coefficients.
+ * @param[in]     *pState    points to the state buffer.
+ * @param[in]     *pTapDelay points to the array of offset times.
+ * @param[in]     maxDelay   maximum offset time supported.
+ * @param[in]     blockSize  number of samples that will be processed per block.
+ * @return none
+ *
+ * <b>Description:</b>
+ * \par
+ * <code>pCoeffs</code> holds the filter coefficients and has length <code>numTaps</code>.
+ * <code>pState</code> holds the filter's state variables and must be of length
+ * <code>maxDelay + blockSize</code>, where <code>maxDelay</code>
+ * is the maximum number of delay line values.
+ * <code>blockSize</code> is the
+ * number of samples processed by the <code>arm_fir_sparse_f32()</code> function.
+ */
+
+ne10_result_t ne10_fir_sparse_init_float (
+    ne10_fir_sparse_instance_f32_t * S,
+    ne10_uint16_t numTaps,
+    ne10_float32_t * pCoeffs,
+    ne10_float32_t * pState,
+    ne10_int32_t * pTapDelay,
+    ne10_uint16_t maxDelay,
+    ne10_uint32_t blockSize)
+{
+    /* Assign filter taps */
+    S->numTaps = numTaps;
+
+    /* Assign coefficient pointer */
+    S->pCoeffs = pCoeffs;
+
+    /* Assign TapDelay pointer */
+    S->pTapDelay = pTapDelay;
+
+    /* Assign MaxDelay */
+    S->maxDelay = maxDelay;
+
+    /* reset the stateIndex to 0 */
+    S->stateIndex = 0u;
+
+    /* Clear state buffer and size is always maxDelay + blockSize */
+    memset (pState, 0, (maxDelay + blockSize) * sizeof (ne10_float32_t));
+
+    /* Assign state pointer */
+    S->pState = pState;
+
+    return NE10_OK;
+}
+
+
diff --git a/modules/dsp/NE10_iir.c b/modules/dsp/NE10_iir.c
new file mode 100644
index 0000000..21afc7d
--- /dev/null
+++ b/modules/dsp/NE10_iir.c
@@ -0,0 +1,306 @@
+/*
+ *  Copyright 2012 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_iir.c
+ */
+
+#include "NE10_types.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
+ *
+ * This set of functions implements lattice filters
+ * for Q15, Q31 and floating-point data types.  Lattice filters are used in a
+ * variety of adaptive filter applications.  The filter structure has feedforward and
+ * feedback components and the net impulse response is infinite length.
+ * The functions operate on blocks
+ * of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
+ * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
+
+ * \par Algorithm:
+ * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
+ * <pre>
+ *    fN(n)   =  x(n)
+ *    fm-1(n) = fm(n) - km * gm-1(n-1)   for m = N, N-1, ...1
+ *    gm(n)   = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
+ *    y(n)    = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+ * </pre>
+ * \par
+ * <code>pkCoeffs</code> points to array of reflection coefficients of size <code>numStages</code>.
+ * Reflection coefficients are stored in time-reversed order.
+ * \par
+ * <pre>
+ *    {kN, kN-1, ....k1}
+ * </pre>
+ * <code>pvCoeffs</code> points to the array of ladder coefficients of size <code>(numStages+1)</code>.
+ * Ladder coefficients are stored in time-reversed order.
+ * \par
+ * <pre>
+ *    v0, v1, ...vN
+ * </pre>
+ * <code>pState</code> points to a state array of size <code>numStages + blockSize</code>.
+ * The state variables shown in the figure above (the g values) are stored in the <code>pState</code> array.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+  *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+ * <pre>
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ * </pre>
+ * \par
+ * where <code>numStages</code> is the number of stages in the filter; <code>pState</code> points to the state buffer array;
+ * <code>pkCoeffs</code> points to array of the reflection coefficients; <code>pvCoeffs</code> points to the array of ladder coefficients.
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S,
+                               ne10_float32_t * pSrc,
+                               ne10_float32_t * pDst,
+                               ne10_uint32_t blockSize)
+{
+    ne10_float32_t fcurr, fnext = 0, gcurr, gnext;      /* Temporary variables for lattice stages */
+    ne10_float32_t acc;                                 /* Accumlator */
+    ne10_uint32_t blkCnt, tapCnt;                       /* temporary variables for counts */
+    ne10_float32_t *px1, *px2, *pk, *pv;                /* temporary pointers for state and coef */
+    ne10_uint32_t numStages = S->numStages;             /* number of stages */
+    ne10_float32_t *pState;                             /* State pointer */
+    ne10_float32_t *pStateCurnt;                        /* State current pointer */
+
+
+    /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+    gcurr = 0.0f;
+    blkCnt = blockSize;
+
+    pState = &S->pState[0];
+
+    /* Sample processing */
+    while (blkCnt > 0u)
+    {
+        /* Read Sample from input buffer */
+        /* fN(n) = x(n) */
+        fcurr = *pSrc++;
+
+        /* Initialize state read pointer */
+        px1 = pState;
+        /* Initialize state write pointer */
+        px2 = pState;
+        /* Set accumulator to zero */
+        acc = 0.0f;
+        /* Initialize Ladder coeff pointer */
+        pv = &S->pvCoeffs[S->numStages];
+        /* Initialize Reflection coeff pointer */
+        pk = &S->pkCoeffs[0];
+
+
+        /* Process sample for first tap */
+        gcurr = *px1++;
+        /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+        fnext = fcurr - ( (*pk) * gcurr);
+        /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+        gnext = (fnext * (*pk++)) + gcurr;
+        /* write gN(n) into state for next sample processing */
+        *px2++ = gnext;
+        /* y(n) += gN(n) * vN  */
+        acc += (gnext * (*pv--));
+
+        /* Update f values for next coefficient processing */
+        fcurr = fnext;
+
+        /* Loop unrolling.  Process 4 taps at a time. */
+        tapCnt = (numStages - 1u) >> 2;
+
+        while (tapCnt > 0u)
+        {
+            /* Process sample for 2nd, 6th ...taps */
+            /* Read gN-2(n-1) from state buffer */
+            gcurr = *px1++;
+            /* Process sample for 2nd, 6th .. taps */
+            /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+            fnext = fcurr - ( (*pk) * gcurr);
+            /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+            gnext = (fnext * (*pk++)) + gcurr;
+            /* y(n) += gN-1(n) * vN-1  */
+            /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+            acc += (gnext * (*pv--));
+            /* write gN-1(n) into state for next sample processing */
+            *px2++ = gnext;
+
+
+            /* Process sample for 3nd, 7th ...taps */
+            /* Read gN-3(n-1) from state buffer */
+            gcurr = *px1++;
+            /* Process sample for 3rd, 7th .. taps */
+            /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+            fcurr = fnext - ( (*pk) * gcurr);
+            /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+            gnext = (fcurr * (*pk++)) + gcurr;
+            /* y(n) += gN-2(n) * vN-2  */
+            /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+            acc += (gnext * (*pv--));
+            /* write gN-2(n) into state for next sample processing */
+            *px2++ = gnext;
+
+
+            /* Process sample for 4th, 8th ...taps */
+            /* Read gN-4(n-1) from state buffer */
+            gcurr = *px1++;
+            /* Process sample for 4th, 8th .. taps */
+            /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+            fnext = fcurr - ( (*pk) * gcurr);
+            /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+            gnext = (fnext * (*pk++)) + gcurr;
+            /* y(n) += gN-3(n) * vN-3  */
+            /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+            acc += (gnext * (*pv--));
+            /* write gN-3(n) into state for next sample processing */
+            *px2++ = gnext;
+
+
+            /* Process sample for 5th, 9th ...taps */
+            /* Read gN-5(n-1) from state buffer */
+            gcurr = *px1++;
+            /* Process sample for 5th, 9th .. taps */
+            /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */
+            fcurr = fnext - ( (*pk) * gcurr);
+            /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+            gnext = (fcurr * (*pk++)) + gcurr;
+            /* y(n) += gN-4(n) * vN-4  */
+            /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+            acc += (gnext * (*pv--));
+            /* write gN-4(n) into state for next sample processing */
+            *px2++ = gnext;
+
+            tapCnt--;
+
+        }
+
+        fnext = fcurr;
+
+        /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+        tapCnt = (numStages - 1u) % 0x4u;
+
+        while (tapCnt > 0u)
+        {
+            gcurr = *px1++;
+            /* Process sample for last taps */
+            fnext = fcurr - ( (*pk) * gcurr);
+            gnext = (fnext * (*pk++)) + gcurr;
+            /* Output samples for last taps */
+            acc += (gnext * (*pv--));
+            *px2++ = gnext;
+            fcurr = fnext;
+
+            tapCnt--;
+
+        }
+
+
+        /* y(n) += g0(n) * v0 */
+        acc += (fnext * (*pv));
+
+        *px2++ = fnext;
+
+        /* write out into pDst */
+        *pDst++ = acc;
+
+        /* Advance the state pointer by 4 to process the next group of 4 samples */
+        pState = pState + 1u;
+        blkCnt--;
+
+    }
+
+    /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+       for the preperation of next frame process */
+
+    /* Points to the start of the state buffer */
+    pStateCurnt = &S->pState[0];
+    pState = &S->pState[blockSize];
+
+    tapCnt = numStages >> 2u;
+
+    /* copy data */
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+
+    }
+
+    /* Calculate remaining number of copies */
+    tapCnt = (numStages) % 0x4u;
+
+    /* Copy the remaining q31_t data */
+    while (tapCnt > 0u)
+    {
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+}
+
+
+
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/modules/dsp/NE10_iir.neon.s b/modules/dsp/NE10_iir.neon.s
new file mode 100644
index 0000000..81af588
--- /dev/null
+++ b/modules/dsp/NE10_iir.neon.s
@@ -0,0 +1,378 @@
+@/*
+@ * Copyright 2012 ARM Limited
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License")@
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at
+@ *
+@ *     http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ */
+
+@/*
+@ * NE10 Library : dsp/NE10_iir.neon.s
+@ */
+
+
+        .text
+        .syntax   unified
+
+        @/**
+        @ * @brief Processing function for the floating-point IIR lattice filter.
+        @ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+        @ * @param[in] *pSrc points to the block of input data.
+        @ * @param[out] *pDst points to the block of output data.
+        @ * @param[in] blockSize number of samples to process.
+        @ * @return none.
+        @ */
+
+        .align   4
+        .global   ne10_iir_lattice_float_neon
+        .extern   ne10_qMaskTable32
+        .thumb
+        .thumb_func
+
+ne10_iir_lattice_float_neon:
+                        PUSH    {r4-r12,lr}
+                        @VPUSH   {d8,d9}
+
+@/*ARM Registers*/
+pStateStruct     .req   R0
+pSrc             .req   R1
+pDst             .req   R2
+blockSize        .req   R3
+
+pState           .req   R4             @/* State pointer */
+pKcoeffs         .req   R5             @/* Coefficient pointer */
+pVcoeffs         .req   R6             @/* Coefficient pointer */
+
+pX               .req   R7             @/* Temporary pointers for state buffer */
+pK               .req   R8             @/* Temporary pointers for coefficient buffer */
+numStages        .req   R9             @/* Length of the filter */
+
+tapCnt           .req   R10            @ /* Loop counter */
+pTemp            .req   R11
+
+
+pMask            .req   R14            @  /* Mask Table */
+
+mask             .req   R12
+pV               .req   R12
+
+@/*NEON variale Declaration*/
+dTemp3a_0        .dn    D0.U32
+dTemp3_0         .dn    D0.F32
+dMask2           .dn    D1.U32
+
+qGcurr           .qn   Q1.F32
+dGcurr_0         .dn   D2.F32
+dGcurr_1         .dn   D3.F32
+
+qZero            .qn   Q2.F32
+
+qMask            .qn   Q3.U32
+dMask_0          .dn   D6.U32
+dMask_1          .dn   D7.U32
+dOut_0           .dn   D6.F32
+dOut_1           .dn   D7.F32
+
+qGK              .qn   Q4.F32
+dGK_0            .dn   D8.F32
+dGK_1            .dn   D9.F32
+
+qAcc0            .qn   Q8.F32
+dAcc0_0          .dn   D16.F32
+dAcc0_1          .dn   D17.F32
+
+qTemp            .qn   Q9.F32
+dTemp_0          .dn   D18.F32
+dTemp_1          .dn   D19.F32
+
+qFnext           .qn   Q10.F32
+dFnext_0         .dn   D20.F32
+dFnext_1         .dn   D21.F32
+
+qFcurr           .qn   Q11.F32
+dFcurr_0         .dn   D22.F32
+dFcurr_1         .dn   D23.F32
+
+qCoeff0          .qn   Q12.F32
+dCoeff0_0        .dn   D24.F32
+dCoeff0_1        .dn   D25.F32
+
+qMask1           .qn   Q13.U32
+dMask1_0         .dn   D26.U32
+dMask1_1         .dn   D27.U32
+
+
+qMaskTmp         .qn   Q14.U32
+dMaskTmp_0       .dn   D28.U32
+dMaskTmp_1       .dn   D29.U32
+
+qGnext           .qn   Q15.F32
+dGnext_0         .dn   D30.F32
+dGnext_1         .dn   D31.F32
+
+
+                            @/* Length of the filter */
+                            LDRH         numStages,[pStateStruct],#4
+                            @/* State pointer */
+                            LDR         pState,[pStateStruct],#4
+                            @/* Coefficient pointer */
+                            LDR         pKcoeffs,[pStateStruct],#4
+                            LDR         pVcoeffs,[pStateStruct],#4
+
+
+                            @/*Load Mask Valies*/
+                            LDR         pMask,=ne10_qMaskTable32
+                            AND         mask,numStages,#3
+                            ADD         tapCnt,mask,#1
+
+                            ADD         pTemp,pMask,mask,LSL #4
+                            ADD         tapCnt,pMask,tapCnt,LSL #4
+
+                            VLD1        {dMaskTmp_0,dMaskTmp_1},[pTemp]
+                            VLD1        {dMask1_0,dMask1_1},[tapCnt]
+
+                            ADD         pTemp,pMask,#16
+                            VEOR        qZero,qZero
+                            VLD1        {dMask2},[pTemp]
+
+                            @/*while blockSize > 0*/
+                            CMP         blockSize, #0
+                            BEQ         firLatticeCopy
+
+
+firLatticeOuterLoop:
+                            VLD1        {dFcurr_0[],dFcurr_1[]},[pSrc]!
+                            MOV         pX,pState
+                            VEOR        qAcc0,qAcc0
+                            @/* Initialize Ladder coeff pointer */
+                            ADD         pV,pVcoeffs,numStages, LSL #2
+                            MOV         pK,pKcoeffs
+
+                            VLD1        {dGcurr_0,dGcurr_1},[pX]
+                            @/* Load the filter Taps */
+                            VLD1        {dCoeff0_0,dCoeff0_1},[pK]!
+
+                            SUBS        tapCnt,numStages,#4
+                            ADD         pV,pV,#4
+                            BLT         firLatticeEndInnerLoop
+
+
+firLatticeInnerLoop:
+
+
+
+                            VMUL        qGK,qGcurr,qCoeff0
+
+                            @/* g4k4+g5k5 g6k6+g7k7*/
+                            VPADD       dTemp_0,dGK_1,dGK_0
+                            @/*g6k6 g4k4+g5k5*/
+                            VEXT        dTemp_1,dTemp_0,dGK_1,#1
+                            @/*g7k7+g6k6+g5k5+g4k4 g6k6+g5k5+g4k4*/
+                            VPADD       dTemp_1,dTemp_1,dTemp_0
+                            VMOV        dTemp3a_0,dMask2
+                            VBSL        dTemp3a_0,dGK_0,dTemp_0
+                            VMOV        dTemp_0,dTemp3_0
+                            VSUB        qFnext,qFcurr,qTemp
+
+                            @/* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+                            VMLA        qGcurr,qFnext,qCoeff0
+
+                            @/* y(n) += gN(n) * vN  */
+                            SUB         pV,pV,#16
+                            VLD1        {dCoeff0_0,dCoeff0_1},[pV]
+
+                            @/* write gN-1(n-1) into state for next sample processing */
+                            VST1        {dGcurr_0,dGcurr_1},[pX]!
+                            VREV64      qCoeff0,qCoeff0
+                            @/* acc0 += gnext * (*pv--)@  */
+                            VMLA        dAcc0_0,dGcurr_0,dCoeff0_1
+                            VMLA        dAcc0_1,dGcurr_1,dCoeff0_0
+
+                            @/* Update f values for next coefficients processing */
+
+                            VDUP        qFcurr,dFnext_1[1]
+
+                            VLD1        {dGcurr_0,dGcurr_1},[pX]
+                            @/* Load the filter Taps */
+                            VLD1        {dCoeff0_0,dCoeff0_1},[pK]!
+
+                            SUBS        tapCnt,#4
+                            BGE         firLatticeInnerLoop
+firLatticeEndInnerLoop:
+                            @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
+                            ADDS        tapCnt,#4
+                            IT          GT
+                            SUBGT       tapCnt,#1
+
+
+                            VMUL        qGK,qGcurr,qCoeff0
+
+                            VPADD       dTemp_0,dGK_1,dGK_0
+                            VEXT        dTemp_1,dTemp_0,dGK_1,#1
+                            VPADD       dTemp_1,dTemp_1,dTemp_0
+                            VMOV        dTemp3a_0,dMask2
+                            VBSL        dTemp3a_0,dGK_0,dTemp_0
+                            VMOV        dTemp_0,dTemp3_0
+
+                            @/*Mask the Uncessary f values*/
+                            VMOV        qFnext,qMaskTmp
+                            VBSL        qFnext,qTemp,qZero
+                            VSUB        qFnext,qFcurr,qFnext
+
+                            VMOV        qGnext,qGcurr
+                            VMLA        qGnext,qFnext,qCoeff0
+
+                            @/*Store on to stack for getting proper Fnext*/
+                            SUB         pTemp,SP,#20
+                            VST1        {dFnext_0,dFnext_1},[pTemp]
+
+                            ADD         pTemp,pTemp,tapCnt, LSL #2
+                            VLD1        {dTemp_0[],dTemp_1[]},[pTemp]
+
+                            VMOV        qGcurr,qMaskTmp
+                            VBSL        qGcurr,qGnext,qTemp
+
+                            VLD1        {dTemp_0,dTemp_1},[pX]
+                            VMOV        qMask,qMask1
+                            VBSL        qMask,qGcurr,qTemp
+                            VST1        {dMask_0,dMask_1},[pX]
+
+                            ADD         pX,pX,tapCnt,LSL #2
+
+                            SUB         pV,pV,#16
+                            VLD1        {dCoeff0_0,dCoeff0_1},[pV]
+
+                            @// MASk the Gnext value used for Output calculation
+                            VMOV        qGnext,qMask1
+                            VBSL        qGnext,qGcurr,qZero
+                            ADD         pX,pX,#4
+
+                            VREV64      qCoeff0,qCoeff0
+
+                            VMLA        dAcc0_0,dGnext_0,dCoeff0_1
+                            VMLA        dAcc0_1,dGnext_1,dCoeff0_0
+
+                            /*Get Accumulated Result in to single Value*/
+
+                            VLD1        {dTemp_1},[pDst]
+                            VPADD       dTemp_0,dAcc0_0,dAcc0_1
+                            VPADD       dTemp_0,dTemp_0
+
+                            VMOV        dMask_0,dMask2
+                            VBSL        dMask_0,dTemp_0,dTemp_1
+
+                            VST1        {dMask_0},[pDst]
+                            ADD         pDst,#4
+                            ADD         pState,#4
+
+                            SUBS        blockSize,#1
+
+                            BGT         firLatticeOuterLoop
+
+
+                            @/* copy last S->numStages samples to start of the buffer
+                            @for next frame process */
+
+firLatticeCopy:
+                            AND         mask,numStages,#3
+                            ADD         pTemp,pMask,mask,LSL #4
+                            LDR         pX,[pStateStruct,#-12]
+
+                            VLD1        {dFcurr_0,dFcurr_1},[pState]!
+                            VLD1        {dMask_0,dMask_1},[pTemp]
+                            SUBS        tapCnt,numStages,#4
+                            BLT         firLatticeEnd
+firLatticeCopyLoop:
+                            VST1        {dFcurr_0,dFcurr_1},[pX]!
+                            SUBS        tapCnt,#4
+                            VLD1        {dFcurr_0,dFcurr_1},[pState]!
+                            BGE         firLatticeCopyLoop
+firLatticeEnd:
+                            VLD1        {dTemp_0,dTemp_1},[pX]
+                            VBSL        qMask,qFcurr,qTemp
+                            VST1        {dOut_0,dOut_1},[pX]
+                            ADD         pX,pX,mask, LSL #2
+@/*ARM Registers*/
+.unreq    pStateStruct
+.unreq    pSrc
+.unreq    pDst
+.unreq    blockSize
+
+.unreq    pState
+.unreq    pKcoeffs
+.unreq    pVcoeffs
+
+.unreq    pX
+.unreq    pK
+.unreq    numStages
+
+.unreq    tapCnt
+.unreq    pTemp
+.unreq    pMask
+.unreq    mask
+.unreq    pV
+
+@/*NEON variale Declaration*/
+.unreq    dTemp3a_0
+.unreq    dTemp3_0
+.unreq    dMask2
+
+.unreq    qGcurr
+.unreq    dGcurr_0
+.unreq    dGcurr_1
+
+.unreq    qZero
+.unreq    qMask
+.unreq    dMask_0
+.unreq    dMask_1
+.unreq    dOut_0
+.unreq    dOut_1
+
+.unreq    qGK
+.unreq    dGK_0
+.unreq    dGK_1
+
+.unreq    qAcc0
+.unreq    dAcc0_0
+.unreq    dAcc0_1
+
+.unreq    qTemp
+.unreq    dTemp_0
+.unreq    dTemp_1
+
+.unreq    qFnext
+.unreq    dFnext_0
+.unreq    dFnext_1
+
+.unreq    qFcurr
+.unreq    dFcurr_0
+.unreq    dFcurr_1
+
+.unreq    qCoeff0
+.unreq    dCoeff0_0
+.unreq    dCoeff0_1
+
+.unreq    qMask1
+.unreq    dMask1_0
+.unreq    dMask1_1
+
+.unreq    qMaskTmp
+.unreq    dMaskTmp_0
+.unreq    dMaskTmp_1
+
+.unreq    qGnext
+.unreq    dGnext_0
+.unreq    dGnext_1
+
+                            @VPOP        {d8,d9}
+                            POP         {r4-r12,pc}
+
+        .end
diff --git a/modules/dsp/NE10_iir_init.c b/modules/dsp/NE10_iir_init.c
new file mode 100644
index 0000000..22185dc
--- /dev/null
+++ b/modules/dsp/NE10_iir_init.c
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2012 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_iir_init.c
+ */
+#include "NE10_types.h"
+
+
+/**
+ * @brief Initialization function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] *pkCoeffs points to the reflection coefficient buffer.  The array is of length numStages.
+ * @param[in] *pvCoeffs points to the ladder coefficient buffer.  The array is of length numStages+1.
+ * @param[in] *pState points to the state buffer.  The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+ne10_result_t ne10_iir_lattice_init_float (ne10_iir_lattice_instance_f32_t * S,
+        ne10_uint16_t numStages,
+        ne10_float32_t * pkCoeffs,
+        ne10_float32_t * pvCoeffs,
+        ne10_float32_t * pState,
+        ne10_uint32_t blockSize)
+{
+    /* Assign filter taps */
+    S->numStages = numStages;
+
+    /* Assign reflection coefficient pointer */
+    S->pkCoeffs = pkCoeffs;
+
+    /* Assign ladder coefficient pointer */
+    S->pvCoeffs = pvCoeffs;
+
+    /* Clear state buffer and size is always blockSize + numStages */
+    memset (pState, 0, (numStages + blockSize) * sizeof (ne10_float32_t));
+
+    /* Assign state pointer */
+    S->pState = pState;
+
+    return NE10_OK;
+}
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/modules/dsp/NE10_init_dsp.c b/modules/dsp/NE10_init_dsp.c
index fefcc1e..7885a0c 100644
--- a/modules/dsp/NE10_init_dsp.c
+++ b/modules/dsp/NE10_init_dsp.c
@@ -25,12 +25,28 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available)
         ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_neon;
         ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_neon;
         ne10_rfft_float = ne10_rfft_float_neon;
+
+        ne10_fir_float = ne10_fir_float_neon;
+        ne10_fir_decimate_float = ne10_fir_decimate_float_neon;
+        ne10_fir_interpolate_float = ne10_fir_interpolate_float_neon;
+        ne10_fir_lattice_float = ne10_fir_lattice_float_neon;
+        ne10_fir_sparse_float = ne10_fir_sparse_float_neon;
+
+        ne10_iir_lattice_float = ne10_iir_lattice_float_neon;
     }
     else
     {
         ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_c;
         ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_c;
         ne10_rfft_float = ne10_rfft_float_c;
+
+        ne10_fir_float = ne10_fir_float_c;
+        ne10_fir_decimate_float = ne10_fir_decimate_float_c;
+        ne10_fir_interpolate_float = ne10_fir_interpolate_float_c;
+        ne10_fir_lattice_float = ne10_fir_lattice_float_c;
+        ne10_fir_sparse_float = ne10_fir_sparse_float_c;
+
+        ne10_iir_lattice_float = ne10_iir_lattice_float_c;
     }
     return NE10_OK;
 }
@@ -51,3 +67,38 @@ void (*ne10_rfft_float)(
                      ne10_float32_t * pSrc,
                      ne10_float32_t * pDst,
                      ne10_float32_t * pTemp);
+
+void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+void (*ne10_fir_decimate_float)(
+                     const ne10_fir_decimate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+void (*ne10_fir_interpolate_float)(
+                     const ne10_fir_interpolate_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+void (*ne10_fir_lattice_float)(
+                     const ne10_fir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);
+
+void (*ne10_fir_sparse_float)(
+                     ne10_fir_sparse_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_float32_t * pScratchIn,
+                     ne10_uint32_t blockSize);
+
+void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S,
+                     ne10_float32_t * pSrc,
+                     ne10_float32_t * pDst,
+                     ne10_uint32_t blockSize);