add notes and image for doxygen

author yang <yang.zhang@arm.com>

Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)

committer yang <yang.zhang@arm.com>

Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)
author yang <yang.zhang@arm.com>
Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)
committer yang <yang.zhang@arm.com>
Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)
diff --git a/doc/FunctionList.txt b/doc/FunctionList.txt

deleted file mode 100644 (file)

index c8845d6..0000000
--- a/doc/FunctionList.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- *  Copyright 2012 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : FunctionList.txt
- */
-
-UPDATE HISTORY
-==============
-----UPDATED ON: 30 / NOV / 2012
-----UPDATED ON: 10 / APR / 2012
-
-Overview
-=========
-
-This file lists currently available functions in Ne10.
-
-math module
-============
-  a) Vector Arithmetic
-
-   abs (float, vec2f, vec3f, vec4f)
-   addc (float, vec2f, vec3f, vec4f)
-   add (float, vec2f, vec3f, vec4f)
-   cross (vec3f)
-   divc (float, vec2f, vec3f, vec4f)
-   div (float, vec2f, vec3f, vec4f)
-   dot (vec2f, vec3f, vec4f)
-   len (vec2f, vec3f, vec4f)
-   mlac (float, vec2f, vec3f, vec4f)
-   mla (float, vec2f, vec3f, vec4f)
-   mulc (float, vec2f, vec3f, vec4f)
-   mul (float, vec2f, vec3f, vec4f)
-   normalize (vec2f, vec3f, vec4f)
-   rsbc (float, vec2f, vec3f, vec4f)
-   setc (float, vec2f, vec3f, vec4f)
-   subc (float, vec2f, vec3f, vec4f)
-   sub (float, vec2f, vec3f, vec4f)
-
-  b) Matrix operations:
-
-   addmat (2x2f, 3x3f, 4x4f)
-   detmat (2x2f, 3x3f, 4x4f)
-   identitymat (2x2f, 3x3f, 4x4f)
-   invmat (2x2f, 3x3f, 4x4f)
-   mulcmatvec (2x2f, 3x3f, 4x4f)
-   mulmat (2x2f, 3x3f, 4x4f)
-   submat (2x2f, 3x3f, 4x4f)
-   transmat (2x2f, 3x3f, 4x4f)
-
-dsp module
-===========
-  a) FFT
-
-   cfft (16, 64, 256, 1024 points)
-   rfft (128, 512 points)
-
-  b) Filter
-
-   fir
-   fir decimate
-   fir interpolat
-   fir lattice
-   fir sparse
-   iir lattice
diff --git a/tools/doxygen/doxygen.cfg b/doc/doxygen/doxygen.cfg

similarity index 99%

rename from tools/doxygen/doxygen.cfg

rename to doc/doxygen/doxygen.cfg

index 8d886fa..d34569d 100644 (file)
--- a/tools/doxygen/doxygen.cfg
+++ b/doc/doxygen/doxygen.cfg
@@ -858,7 +858,7 @@ GENERATE_HTML          = YES
  # If a relative path is entered the value of OUTPUT_DIRECTORY will be
  # put in front of it. If left blank `html' will be used as the default path.
  
-HTML_OUTPUT            = html
+HTML_OUTPUT            = ./documentation
  
  # The HTML_FILE_EXTENSION tag can be used to specify the file extension for
  # each generated HTML page (for example: .htm,.php,.asp). If it is left blank
diff --git a/doc/doxygen/image/CFFT.gif b/doc/doxygen/image/CFFT.gif

new file mode 100755 (executable)

index 0000000..1dd540c

Binary files /dev/null and b/doc/doxygen/image/CFFT.gif differ
diff --git a/doc/doxygen/image/FIR.gif b/doc/doxygen/image/FIR.gif

new file mode 100755 (executable)

index 0000000..2e0d1fc

Binary files /dev/null and b/doc/doxygen/image/FIR.gif differ
diff --git a/doc/doxygen/image/FIRDecimator.gif b/doc/doxygen/image/FIRDecimator.gif

new file mode 100755 (executable)

index 0000000..0229d31

Binary files /dev/null and b/doc/doxygen/image/FIRDecimator.gif differ
diff --git a/doc/doxygen/image/FIRInterpolator.gif b/doc/doxygen/image/FIRInterpolator.gif

new file mode 100755 (executable)

index 0000000..ee83141

Binary files /dev/null and b/doc/doxygen/image/FIRInterpolator.gif differ
diff --git a/doc/doxygen/image/FIRLattice.gif b/doc/doxygen/image/FIRLattice.gif

new file mode 100755 (executable)

index 0000000..7558ffa

Binary files /dev/null and b/doc/doxygen/image/FIRLattice.gif differ
diff --git a/doc/doxygen/image/FIRSparse.gif b/doc/doxygen/image/FIRSparse.gif

new file mode 100755 (executable)

index 0000000..bc05c4f

Binary files /dev/null and b/doc/doxygen/image/FIRSparse.gif differ
diff --git a/doc/doxygen/image/IIRLattice.gif b/doc/doxygen/image/IIRLattice.gif

new file mode 100755 (executable)

index 0000000..356152b

Binary files /dev/null and b/doc/doxygen/image/IIRLattice.gif differ
diff --git a/doc/doxygen/image/RFFT.gif b/doc/doxygen/image/RFFT.gif

new file mode 100755 (executable)

index 0000000..c05ed8e

Binary files /dev/null and b/doc/doxygen/image/RFFT.gif differ
diff --git a/doc/doxygen/image/RIFFT.gif b/doc/doxygen/image/RIFFT.gif

new file mode 100755 (executable)

index 0000000..0d9322d

Binary files /dev/null and b/doc/doxygen/image/RIFFT.gif differ
diff --git a/doc/doxygen/image/ne10_library.png b/doc/doxygen/image/ne10_library.png

new file mode 100644 (file)

index 0000000..e6f5282

Binary files /dev/null and b/doc/doxygen/image/ne10_library.png differ
diff --git a/doc/doxygen/image/ne10_logo.png b/doc/doxygen/image/ne10_logo.png

new file mode 100644 (file)

index 0000000..b9238d5

Binary files /dev/null and b/doc/doxygen/image/ne10_logo.png differ
diff --git a/inc/NE10.h b/inc/NE10.h

index d30092e..7857665 100644 (file)
--- a/inc/NE10.h
+++ b/inc/NE10.h
@@ -30,12 +30,12 @@
   */
  
  /**
-   \mainpage Ne10 Software Library
+   \mainpage Welcome to Ne10 Documentation!
     *
     *
     *\par Introduction
     *
-   * Ne10 is a library of the most commonly used functions that have been heavily
+   * Ne10 (http://projectne10.github.com/Ne10/) is a library of the most commonly used functions that have been heavily
     * optimized for ARM-based CPUs with NEON. These functions provide a consistent
     * well tested behavior that can be easily incorporated into applications enabling
     * developers to get the most out of the ARM V7/NEON without arduous assembly coding.
@@ -43,26 +43,58 @@
     * that can be incorporated in a more modular "pick and mix" form where binary size might
     * be an issue.
     *
-   * The Ne10 components are:
+   * The following figure illustrates the basic concepts of "What's Ne10"
+   *\image html ne10_library.png "Ne10 Library Description"
+   *
+   *\par Top-Level Overview
+   * When you check out Ne10, you will notice a number of directories. These directories are as follows:
+   * <pre>
+   * ├── android
+   * │   └── Android reference files
+   * ├── build
+   * │   └── directory for build-related files
+   * ├── common
+   * │   └── directory for common header, table and macro definition files
+   * ├── doc
+   * │   └── directory for documentation
+   * ├── inc
+   * │   └── directory for functions' header files
+   * ├── modules
+   * │   ├── dsp
+   * │   │   ├── @link groupDSPs dsp module@endlink that provides a set of signal processing functions, such as complex/real FFT/IFFT, FIR and IIR
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * │   ├── math
+   * │   │   ├── @link groupMaths math module@endlink that provides a set of vector/matrix algebra functions
+   * │   │   └── test
+   * │   │       └──  directory for test files
+   * ├── samples
+   * │   └── @link groupSamples sample code@endlink
+   * ├── test
+   * │   ├── directory for test framework
+   * ├── tools
+   * │   ├── directory for tools such as Cformatter, doxygen, etc
+   * </pre>
+   *
+   *\par Modules Description
+   * Ne10 has a modular structure, which means that the package includes several shared or static libraries.
+   * Currently, the following modules are available:
     *
     * - @link groupMaths Math Functions@endlink
     * - @link groupDSPs Signal Processing Functions@endlink
     * - Physics functions
     * - Image Processing functions
     * - Others
-   *\par
-   *\image html ne10_library.png "Ne10 Library Description"
     *
     *\par License
     *
-   * The Ne10 is provided free of charge by ARM Limited and licensed under New BSD license.
+   * Ne10 is provided free of charge by ARM Limited and licensed under the New BSD License (http://en.wikipedia.org/wiki/BSD_licenses#3-clause_license_.28.22New_BSD_License.22_or_.22Modified_BSD_License.22.29).
     */
  
  
  /**
   * @defgroup groupMaths Math Functions
   *
- *\par Introduction
   *
   * This set of functions provide vector/matrix algebra functions that include
   * add, sub, multiply, div and so on. Currently, only the float (single precision)
@@ -72,12 +104,17 @@
  /**
   * @defgroup groupDSPs Signal Processing Functions
   *
- *\par Introduction
   *
   * This set of functions provide some commonly used functions in signal processing,
   * such as complex/real FFT/IFFT, FIR and IIR. Currently, only the float (single precision)
   * data type is supported.
   */
+/**
+ * @defgroup groupSamples Sample Functions
+ *
+ *
+ * This set of functions provide some sample functions.
+ */
  
  
  #ifndef NE10_H
diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h

index fd6f308..d25a9cd 100644 (file)
--- a/inc/NE10_dsp.h
+++ b/inc/NE10_dsp.h
@@ -43,216 +43,255 @@ extern "C" {
  // function prototypes:
  ///////////////////////////
  
-/* fft functions*/
-
-/* function pointers*/
-extern void (*ne10_radix4_butterfly_float)(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef);
-
-extern void (*ne10_radix4_butterfly_inverse_float)(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef,
-                     ne10_float32_t onebyN);
-
-extern void (*ne10_rfft_float)(const ne10_rfft_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pTemp);
-/* init functions*/
-extern ne10_result_t ne10_cfft_radix4_init_float(ne10_cfft_radix4_instance_f32_t * S,
-                     ne10_uint16_t fftLen,
-                     ne10_uint8_t ifftFlag);
-
-extern ne10_result_t ne10_rfft_init_float(ne10_rfft_instance_f32_t * S,
-                     ne10_cfft_radix4_instance_f32_t * S_CFFT,
-                     ne10_uint32_t fftLen,
-                     ne10_uint32_t ifftFlagR);
-/* C version*/
-extern void ne10_radix4_butterfly_float_c(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef);
-
-extern void ne10_radix4_butterfly_inverse_float_c(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef,
-                     ne10_float32_t onebyN);
-
-extern void ne10_rfft_float_c(const ne10_rfft_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pTemp);
-
-
-/* NEON version*/
-extern void ne10_radix4_butterfly_float_neon(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef);
-
-extern void ne10_radix4_butterfly_inverse_float_neon(ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef,
-                     ne10_float32_t onebyN);
-
-extern void ne10_rfft_float_neon(const ne10_rfft_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pTemp);
-/* fir functions*/
-
-/* function pointers*/
-extern void (*ne10_fir_float)(const ne10_fir_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void (*ne10_fir_decimate_float)(const ne10_fir_decimate_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void (*ne10_fir_interpolate_float)(const ne10_fir_interpolate_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void (*ne10_fir_lattice_float)(const ne10_fir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void (*ne10_fir_sparse_float)(ne10_fir_sparse_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pScratchIn,
-                     ne10_uint32_t blockSize);
-
-
-/* init functions*/
-extern ne10_result_t ne10_fir_init_float(ne10_fir_instance_f32_t * S,
-                     ne10_uint16_t numTaps,
-                     ne10_float32_t * pCoeffs,
-                     ne10_float32_t * pState,
-                     ne10_uint32_t blockSize);
-
-extern ne10_result_t ne10_fir_decimate_init_float(ne10_fir_decimate_instance_f32_t * S,
-                     ne10_uint16_t numTaps,
-                     ne10_uint8_t M,
-                     ne10_float32_t * pCoeffs,
-                     ne10_float32_t * pState,
-                     ne10_uint32_t blockSize);
-
-extern ne10_result_t ne10_fir_interpolate_init_float(ne10_fir_interpolate_instance_f32_t * S,
-                     ne10_uint8_t L,
-                     ne10_uint16_t numTaps,
-                     ne10_float32_t * pCoeffs,
-                     ne10_float32_t * pState,
-                     ne10_uint32_t blockSize);
-
-extern ne10_result_t ne10_fir_lattice_init_float(ne10_fir_lattice_instance_f32_t * S,
-                     ne10_uint16_t numStages,
-                     ne10_float32_t * pCoeffs,
-                     ne10_float32_t * pState);
-
-extern ne10_result_t ne10_fir_sparse_init_float(ne10_fir_sparse_instance_f32_t * S,
-                     ne10_uint16_t numTaps,
-                     ne10_float32_t * pCoeffs,
-                     ne10_float32_t * pState,
-                     ne10_int32_t * pTapDelay,
-                     ne10_uint16_t maxDelay,
-                     ne10_uint32_t blockSize);
-
-/* C version*/
-extern void ne10_fir_float_c(const ne10_fir_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_decimate_float_c(const ne10_fir_decimate_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_interpolate_float_c(const ne10_fir_interpolate_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_lattice_float_c(const ne10_fir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_sparse_float_c(ne10_fir_sparse_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pScratchIn,
-                     ne10_uint32_t blockSize);
-
-
-/* NEON version*/
-extern void ne10_fir_float_neon(const ne10_fir_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_decimate_float_neon(const ne10_fir_decimate_instance_f32_t * S,
-                     ne10_float32_t *pSrc,
-                     ne10_float32_t *pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_interpolate_float_neon(const ne10_fir_interpolate_instance_f32_t * S,
-                     ne10_float32_t *pSrc,
-                     ne10_float32_t *pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_lattice_float_neon(const ne10_fir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-extern void ne10_fir_sparse_float_neon(ne10_fir_sparse_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pScratch,
-                     ne10_uint32_t blockSize);
-
-
-/* iir functions*/
-
-/* function pointers*/
-extern void (*ne10_iir_lattice_float)(const ne10_iir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-/* init functions*/
-extern ne10_result_t ne10_iir_lattice_init_float(ne10_iir_lattice_instance_f32_t * S,
-                     ne10_uint16_t numStages,
-                     ne10_float32_t * pkCoeffs,
-                     ne10_float32_t * pvCoeffs,
-                     ne10_float32_t * pState,
-                     ne10_uint32_t blockSize);
-
-
-/* C version*/
-extern void ne10_iir_lattice_float_c(const ne10_iir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
-/* NEON version*/
-extern void ne10_iir_lattice_float_neon(const ne10_iir_lattice_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_uint32_t blockSize);
-
+    /* fft functions*/
+
+    /* function pointers*/
+    extern void (*ne10_radix4_butterfly_float) (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef);
+
+    extern void (*ne10_radix4_butterfly_inverse_float) (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef,
+            ne10_float32_t onebyN);
+
+    extern void (*ne10_rfft_float) (const ne10_rfft_instance_f32_t * S,
+                                    ne10_float32_t * pSrc,
+                                    ne10_float32_t * pDst,
+                                    ne10_float32_t * pTemp);
+    /* init functions*/
+    extern ne10_result_t ne10_cfft_radix4_init_float (ne10_cfft_radix4_instance_f32_t * S,
+            ne10_uint16_t fftLen,
+            ne10_uint8_t ifftFlag);
+
+    extern ne10_result_t ne10_rfft_init_float (ne10_rfft_instance_f32_t * S,
+            ne10_cfft_radix4_instance_f32_t * S_CFFT,
+            ne10_uint32_t fftLen,
+            ne10_uint32_t ifftFlagR);
+    /* C version*/
+    extern void ne10_radix4_butterfly_float_c (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef);
+
+    extern void ne10_radix4_butterfly_inverse_float_c (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef,
+            ne10_float32_t onebyN);
+
+    extern void ne10_rfft_float_c (const ne10_rfft_instance_f32_t * S,
+                                   ne10_float32_t * pSrc,
+                                   ne10_float32_t * pDst,
+                                   ne10_float32_t * pTemp);
+
+
+    /* NEON version*/
+    /**
+     * @addtogroup CFFT_CIFFT
+     * @{
+     */
+    extern void ne10_radix4_butterfly_float_neon (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef);
+
+    extern void ne10_radix4_butterfly_inverse_float_neon (ne10_float32_t *pDst,
+            ne10_float32_t *pSrc,
+            ne10_uint16_t N,
+            ne10_float32_t *pCoef,
+            ne10_float32_t onebyN);
+    /** @} */ //end of CFFT_CIFFT group
+
+
+    extern void ne10_rfft_float_neon (const ne10_rfft_instance_f32_t * S,
+                                      ne10_float32_t * pSrc,
+                                      ne10_float32_t * pDst,
+                                      ne10_float32_t * pTemp);
+
+
+    /* fir functions*/
+
+    /* function pointers*/
+    extern void (*ne10_fir_float) (const ne10_fir_instance_f32_t * S,
+                                   ne10_float32_t * pSrc,
+                                   ne10_float32_t * pDst,
+                                   ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_decimate_float) (const ne10_fir_decimate_instance_f32_t * S,
+                                            ne10_float32_t * pSrc,
+                                            ne10_float32_t * pDst,
+                                            ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_interpolate_float) (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_lattice_float) (const ne10_fir_lattice_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    extern void (*ne10_fir_sparse_float) (ne10_fir_sparse_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_float32_t * pScratchIn,
+                                          ne10_uint32_t blockSize);
+
+
+    /* init functions*/
+    extern ne10_result_t ne10_fir_init_float (ne10_fir_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_decimate_init_float (ne10_fir_decimate_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_uint8_t M,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_interpolate_init_float (ne10_fir_interpolate_instance_f32_t * S,
+            ne10_uint8_t L,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+    extern ne10_result_t ne10_fir_lattice_init_float (ne10_fir_lattice_instance_f32_t * S,
+            ne10_uint16_t numStages,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState);
+
+    extern ne10_result_t ne10_fir_sparse_init_float (ne10_fir_sparse_instance_f32_t * S,
+            ne10_uint16_t numTaps,
+            ne10_float32_t * pCoeffs,
+            ne10_float32_t * pState,
+            ne10_int32_t * pTapDelay,
+            ne10_uint16_t maxDelay,
+            ne10_uint32_t blockSize);
+
+    /* C version*/
+    extern void ne10_fir_float_c (const ne10_fir_instance_f32_t * S,
+                                  ne10_float32_t * pSrc,
+                                  ne10_float32_t * pDst,
+                                  ne10_uint32_t blockSize);
+
+    extern void ne10_fir_decimate_float_c (const ne10_fir_decimate_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    extern void ne10_fir_interpolate_float_c (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+
+    extern void ne10_fir_lattice_float_c (const ne10_fir_lattice_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_uint32_t blockSize);
+
+    extern void ne10_fir_sparse_float_c (ne10_fir_sparse_instance_f32_t * S,
+                                         ne10_float32_t * pSrc,
+                                         ne10_float32_t * pDst,
+                                         ne10_float32_t * pScratchIn,
+                                         ne10_uint32_t blockSize);
+
+
+    /* NEON version*/
+
+    /**
+     * @addtogroup FIR
+     * @{
+     */
+    extern void ne10_fir_float_neon (const ne10_fir_instance_f32_t * S,
+                                     ne10_float32_t * pSrc,
+                                     ne10_float32_t * pDst,
+                                     ne10_uint32_t blockSize);
+    /** @} */ //end of FIR group
+
+    /**
+     * @addtogroup FIR_decimate
+     * @{
+     */
+    extern void ne10_fir_decimate_float_neon (const ne10_fir_decimate_instance_f32_t * S,
+            ne10_float32_t *pSrc,
+            ne10_float32_t *pDst,
+            ne10_uint32_t blockSize);
+    /** @} */ //end of FIR_decimate group
+
+    /**
+     * @addtogroup FIR_Interpolate
+     * @{
+     */
+    extern void ne10_fir_interpolate_float_neon (const ne10_fir_interpolate_instance_f32_t * S,
+            ne10_float32_t *pSrc,
+            ne10_float32_t *pDst,
+            ne10_uint32_t blockSize);
+    /** @} */ //end of FIR_Interpolate group
+
+    /**
+     * @addtogroup FIR_Lattice
+     * @{
+     */
+    extern void ne10_fir_lattice_float_neon (const ne10_fir_lattice_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+    /** @} */ //end of FIR_Lattice group
+
+    /**
+     * @addtogroup FIR_Sparse
+     * @{
+     */
+    extern void ne10_fir_sparse_float_neon (ne10_fir_sparse_instance_f32_t * S,
+                                            ne10_float32_t * pSrc,
+                                            ne10_float32_t * pDst,
+                                            ne10_float32_t * pScratch,
+                                            ne10_uint32_t blockSize);
+    /** @} */ //end of FIR_Sparse group
+
+
+    /* iir functions*/
+
+    /* function pointers*/
+    extern void (*ne10_iir_lattice_float) (const ne10_iir_lattice_instance_f32_t * S,
+                                           ne10_float32_t * pSrc,
+                                           ne10_float32_t * pDst,
+                                           ne10_uint32_t blockSize);
+
+    /* init functions*/
+    extern ne10_result_t ne10_iir_lattice_init_float (ne10_iir_lattice_instance_f32_t * S,
+            ne10_uint16_t numStages,
+            ne10_float32_t * pkCoeffs,
+            ne10_float32_t * pvCoeffs,
+            ne10_float32_t * pState,
+            ne10_uint32_t blockSize);
+
+
+    /* C version*/
+    extern void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S,
+                                          ne10_float32_t * pSrc,
+                                          ne10_float32_t * pDst,
+                                          ne10_uint32_t blockSize);
+
+    /* NEON version*/
+
+    /**
+     * @addtogroup IIR_Lattice
+     * @{
+     */
+    extern void ne10_iir_lattice_float_neon (const ne10_iir_lattice_instance_f32_t * S,
+            ne10_float32_t * pSrc,
+            ne10_float32_t * pDst,
+            ne10_uint32_t blockSize);
+    /** @} */ //end of IIR_Lattice group
  #ifdef __cplusplus
  }
  #endif
diff --git a/inc/NE10_init.h b/inc/NE10_init.h

index 6f8b746..d72a6c7 100644 (file)
--- a/inc/NE10_init.h
+++ b/inc/NE10_init.h
@@ -34,21 +34,21 @@
  extern "C" {
  #endif
  
-/*!
-    This routine returns NE10_OK if the running platform supports NEON, otherwise it returns NE10_ERR
- */
-extern ne10_result_t ne10_HasNEON();
-
-/*!
-    This routine initializes all the function pointers.
- */
-extern ne10_result_t ne10_init();
-
-/*!
-    This routine initializes all the math function pointers defined in "NE10_math.h" with pointers to ARM NEON or ARM VFP implementations.
- */
-extern ne10_result_t ne10_init_math(ne10_int32_t is_NEON_available);
-extern ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available);
+    /*!
+        This routine returns NE10_OK if the running platform supports NEON, otherwise it returns NE10_ERR
+     */
+    extern ne10_result_t ne10_HasNEON();
+
+    /*!
+        This routine initializes all the function pointers.
+     */
+    extern ne10_result_t ne10_init();
+
+    /*!
+        This routine initializes all the math function pointers defined in "NE10_math.h" with pointers to ARM NEON or ARM VFP implementations.
+     */
+    extern ne10_result_t ne10_init_math (ne10_int32_t is_NEON_available);
+    extern ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available);
  
  #ifdef __cplusplus
  }
diff --git a/inc/NE10_math.h b/inc/NE10_math.h

index 720f1ef..1e6e966 100644 (file)
--- a/inc/NE10_math.h
+++ b/inc/NE10_math.h
@@ -46,1158 +46,1436 @@ extern "C" {
  
  // ## Vector-Constant Arithmetic ##
  
-/*!
-    Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar added to the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_addc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector added to the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_addc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector added to the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_addc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector added to the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_addc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar subtracted from the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_subc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector subtracted from the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_subc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector subtracted from the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_subc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector subtracted from the input values
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_subc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar to subtract the input values from
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_rsbc_float)(ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector to subtract the input values from
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_rsbc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector to subtract the input values from
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_rsbc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Subtracts the vectors in an input array from a constant 4D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector to subtract the input values from
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_rsbc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar to multiply the input values with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mulc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector to multiply the input values with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mulc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector to multiply the input values with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mulc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector to multiply the input values with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mulc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Divides the elements of an input array by a constant scalar and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar to divide the input values by
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_divc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Divides the components of 2D vectors in an input array with the components of a constant 2D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector to divide the input values by
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_divc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Divides the components of 3D vectors in an input array with the components of a constant 3D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector to divide the input values by
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_divc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Divides the components of 4D vectors in an input array with the components of a constant 4D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector to divide the input values by
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_divc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Sets the elements of an input array to a constant scalar and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  cst   The constant scalar to set the input values to
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_setc_float)(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-    Sets the components of 2D vectors in an input array to the components of a constant 2D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  cst   Pointer to the 2D vector to set the input values to
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_setc_vec2f)(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-    Sets the components of 3D vectors in an input array to the components of a constant 3D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  cst   Pointer to the 3D vector to set the input values to
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_setc_vec3f)(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-    Sets the components of 4D vectors in an input array to the components of a constant 4D vector and stores the results in an output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  cst   Pointer to the 4D vector to set the input values to
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_setc_vec4f)(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-/*!
-    Multiplies each entry in the source array (src) by cst, then adds the result to
-     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  acc   The corresponding element is added to the result of the multiplication
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   The constant scalar to multiply the input elements with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mlac_float)(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-/*!
-   Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
-     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  acc   The corresponding element is added to the result of the multiplication
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 2D vector to multiply the input vectors with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mlac_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-/*!
-   Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
-     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  acc   The corresponding element is added to the result of the multiplication
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 3D vector to multiply the input vectors with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mlac_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-/*!
-   Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
-     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  acc   The corresponding element is added to the result of the multiplication
-    @param[in]  src   Pointer to the source array
-    @param[in]  cst   Pointer to the 4D vector to multiply the input vectors with
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_mlac_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-
-/*!
-    Adds the elements of src1 to the elements of src2 and stores the results in the dst.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1  The first array to use as the input array
-    @param[in]  src2  The second array to use as the input array
-    @param[in]  count The number of items in the two input arrays
- */
-extern ne10_result_t (*ne10_add_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-/*!
-    Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1  The first array to use as the input array
-    @param[in]  src2  The second array to use as the input array
-    @param[in]  count The number of items in the two input arrays
- */
-extern ne10_result_t (*ne10_sub_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-/*!
-    Multiplies the elements of src1 by the elements of src2 and stores the results in the dst.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1  The first array to use as the input array
-    @param[in]  src2  The second array to use as the input array
-    @param[in]  count The number of items in the two input arrays
- */
-extern ne10_result_t (*ne10_mul_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-/*!
-    Divides the elements of src1 by the elements of src2 and stores the results in the dst.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1  The first array to use as the input array
-    @param[in]  src2  The second array to use as the input array
-    @param[in]  count The number of items in the two input arrays
- */
-extern ne10_result_t (*ne10_div_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-/*!
-    Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  acc   These elements are added to the result of the multiplication operation
-    @param[in]  src1  The first array to use as the input array
-    @param[in]  src2  The second array to use as the input array
-    @param[in]  count The number of items in the two input arrays
- */
-extern ne10_result_t (*ne10_mla_float)(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-/*!
-    Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_abs_float)(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
-
-
-
-// ## Operations on Vectors ##
-/*!
-    Returns length of 2D vectors in corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_len_vec2f)(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-/*!
-    Returns length of 3D vectors in corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_len_vec3f)(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-/*!
-    Returns length of 4D vectors in corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_len_vec4f)(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-/*!
-    Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_normalize_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-/*!
-    Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_normalize_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-/*!
-    Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_normalize_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-
-/*!
-    Generates a 2D vector from the absolute values of each of the components of an input vector
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_abs_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-/*!
-    Generates a 3D vector from the absolute values of each of the components of an input vector
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_abs_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-/*!
-    Generates a 4D vector from the absolute values of each of the components of an input vector
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src   Pointer to the source array
-    @param[in]  count The number of items in the input array
- */
-extern ne10_result_t (*ne10_abs_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-
-/*!
-    Multiplies the components of a 2D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmul_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Multiplies the components of a 3D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmul_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Multiplies the components of a 4D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmul_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-/*!
-    Divides the components of a 2D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the numerators' source array
-    @param[in]  src2   Pointer to the denominators' source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vdiv_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Divides the components of a 3D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the numerators' source array
-    @param[in]  src2   Pointer to the denominators' source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vdiv_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Divides the components of a 4D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the numerators' source array
-    @param[in]  src2   Pointer to the denominators' source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vdiv_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-/*!
-    Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmla_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmla_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_vmla_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Vector-Vector Algebra ##
-
-/*!
-    Vector addition of two 2D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_add_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Vector addition of two 3D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_add_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Vector addition of two 4D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_add_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-/*!
-    Vector subtraction of two 2D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_sub_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Vector subtraction of two 3D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_sub_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Vector subtraction of two 4D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_sub_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-/*!
-    Dot product of two 2D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_dot_vec2f)(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-/*!
-    Dot product of two 3D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_dot_vec3f)(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-/*!
-    Dot product of two 4D vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_dot_vec4f)(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-/*!
-    Performs a cross product operation on the two input vectors
-    @param[out] dst   Pointer to the destination array
-    @param[in]  src1   Pointer to the first source array
-    @param[in]  src2   Pointer to the second source array
-    @param[in]  count The number of items in the input arrays
- */
-extern ne10_result_t (*ne10_cross_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// ne10_mat4x4f_t
-extern ne10_result_t (*ne10_addmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_submat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_mulmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_divmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_setmat_4x4f)(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_addmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_submat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_mulmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_divmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_setmat_3x3f)(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_addmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_submat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_mulmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_divmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_setmat_2x2f)(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-
-
-// ## Operations on Matrices ##
-
-extern ne10_result_t (*ne10_detmat_4x4f)(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_detmat_3x3f)(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_detmat_2x2f)(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_invmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_invmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_invmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_transmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_identitymat_4x4f)(ne10_mat4x4f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_transmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_identitymat_3x3f)(ne10_mat3x3f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t (*ne10_transmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_identitymat_2x2f)(ne10_mat2x2f_t * dst, ne10_uint32_t count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern ne10_result_t (*ne10_mulcmatvec_cm4x4f_v4f)(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_mulcmatvec_cm3x3f_v3f)(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t (*ne10_mulcmatvec_cm2x2f_v2f)(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
-
-
-// ## Matrix-Matrix Algebra ##
-extern ne10_result_t (*ne10_multrans_mat4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_multrans_mat3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t (*ne10_multrans_mat2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-
-
-///////////////////////////
-// C function prototypes:
-///////////////////////////
-
-
-// ## Vector-Constant Arithmetic ##
-
-extern ne10_result_t ne10_addc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_subc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-
-
-
-extern ne10_result_t ne10_rsbc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-
-
-
-extern ne10_result_t ne10_mulc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_divc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_setc_float_c(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec2f_c(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec3f_c(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec4f_c(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_mlac_float_c(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-// ## Arithmetic functions over arrays of cst values ##
-extern ne10_result_t ne10_add_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mul_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_div_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mla_float_c(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_float_c(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
-
-// ## Operations on Vectors ##
-extern ne10_result_t ne10_len_vec2f_c(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec3f_c(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec4f_c(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_normalize_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_abs_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern ne10_result_t ne10_vmul_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vdiv_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vmla_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern ne10_result_t ne10_add_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_sub_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_dot_vec2f_c(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec3f_c(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec4f_c(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_cross_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// ne10_mat4x4f_t
-extern ne10_result_t ne10_addmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_4x4f_c(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_3x3f_c(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_2x2f_c(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-
-
-// ## Operations on Matrices ##
-
-extern ne10_result_t ne10_detmat_4x4f_c(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_3x3f_c(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_2x2f_c(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_invmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_4x4f_c(ne10_mat4x4f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_3x3f_c(ne10_mat3x3f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_2x2f_c(ne10_mat2x2f_t * dst, ne10_uint32_t count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_c(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_c(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_c(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
-
-
-// ## Matrix-Matrix Algebra ##
-extern ne10_result_t ne10_multrans_mat4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-
-
-/////////////////////////////
-// NEON function prototypes:
-/////////////////////////////
-
-
-// ## Vector-Constant Arithmetic ##
-
-extern ne10_result_t ne10_addc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_subc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-
-
-
-extern ne10_result_t ne10_rsbc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-
-
-
-extern ne10_result_t ne10_mulc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_divc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_setc_float_neon(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec2f_neon(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec3f_neon(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec4f_neon(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_mlac_float_neon(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-extern ne10_result_t ne10_add_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mul_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_div_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mla_float_neon(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_float_neon(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
-
-// ## Operations on Vectors ##
-extern ne10_result_t ne10_len_vec2f_neon(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec3f_neon(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec4f_neon(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_normalize_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_abs_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern ne10_result_t ne10_vmul_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vdiv_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vmla_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern ne10_result_t ne10_add_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_sub_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_dot_vec2f_neon(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec3f_neon(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec4f_neon(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_cross_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// ne10_mat4x4f_t
-extern ne10_result_t ne10_addmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_4x4f_neon(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_3x3f_neon(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_2x2f_neon(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-
-
-// ## Operations on Matrices ##
-
-
-extern ne10_result_t ne10_detmat_4x4f_neon(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_3x3f_neon(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_2x2f_neon(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_invmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_uint32_t count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_neon(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_neon(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_neon(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
-
-
-
-
-// ## Matrix-Matrix Algebra ##
-extern ne10_result_t ne10_multrans_mat4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-
-
-////////////////////////////
-// VFP function prototypes:
-////////////////////////////
-
-// ## Vector-Constant Arithmetic ##
-
-extern ne10_result_t ne10_addc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_addc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_subc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-extern ne10_result_t ne10_subc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
-
-
-
-extern ne10_result_t ne10_rsbc_float_asm(ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t *src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t *src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-extern ne10_result_t ne10_rsbc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t *src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
-
-
-
-extern ne10_result_t ne10_mulc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mulc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_divc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_divc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_setc_float_asm(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec2f_asm(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec3f_asm(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_setc_vec4f_asm(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_mlac_float_asm(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
-extern ne10_result_t ne10_mlac_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-extern ne10_result_t ne10_add_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mul_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_div_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mla_float_asm(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_float_asm(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
-
-// ## Operations on Vectors ##
-extern ne10_result_t ne10_len_vec2f_asm(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec3f_asm(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_len_vec4f_asm(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_normalize_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_normalize_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_abs_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_abs_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern ne10_result_t ne10_vmul_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmul_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vdiv_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vdiv_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_vmla_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_vmla_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern ne10_result_t ne10_add_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_add_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_sub_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_sub_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_dot_vec2f_asm(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec3f_asm(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_dot_vec4f_asm(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
-
-
-
-extern ne10_result_t ne10_cross_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// ne10_mat4x4f_t
-extern ne10_result_t ne10_addmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_4x4f_asm(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_3x3f_asm(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_addmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_submat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_mulmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_divmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_setmat_2x2f_asm(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
-
-
-
-// ## Operations on Matrices ##
-
-extern ne10_result_t ne10_detmat_4x4f_asm(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_3x3f_asm(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_detmat_2x2f_asm(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_invmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_invmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_transmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identitymat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_uint32_t count);
-
-extern ne10_result_t ne10_trans_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_identity_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_uint32_t count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_asm(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_asm(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
-extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_asm(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
-
-
-
-
-// ## Matrix-Matrix Algebra ##
-extern ne10_result_t ne10_multrans_mat4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
-extern ne10_result_t ne10_multrans_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ADD_VEC Vector Add
+     *
+     * \par
+     * These functions implement the vector add operation for float data type.
+     */
+
+    /**
+     * @addtogroup ADD_VEC
+     * @{
+     */
+
+    /**
+     * Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_addc_float_c, ne10_addc_float_neon or ne10_addc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_addc_vec2f_c, ne10_addc_vec2f_neon or ne10_addc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_addc_vec3f_c, ne10_addc_vec3f_neon or ne10_addc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_addc_vec4f_c, ne10_addc_vec4f_neon or ne10_addc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector added to the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_addc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_addc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+    /**
+     * Adds the elements of src1 to the elements of src2 and stores the results in the dst.
+     * This function pointer can point to any one of ne10_add_float_c, ne10_add_float_neon or ne10_add_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_add_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 2D vectors.
+     * This function pointer can point to any one of ne10_add_vec2f_c, ne10_add_vec2f_neon or ne10_add_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 3D vectors.
+     * This function pointer can point to any one of ne10_add_vec3f_c, ne10_add_vec3f_neon or ne10_add_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector addition of two 4D vectors.
+     * This function pointer can point to any one of ne10_add_vec4f_c, ne10_add_vec4f_neon or ne10_add_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_add_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_add_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Add group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ADD_MAT Matrix Add
+     *
+     * \par
+     * These functions implement the matrix add operation for float data type.
+     */
+
+    /**
+     * @addtogroup ADD_MAT
+     * @{
+     */
+
+    /**
+     * Matrix addition of two 4x4 matrices.
+     * This function pointer can point to any one of ne10_addmat_4x4f_c, ne10_addmat_4x4f_neon or ne10_addmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    /**
+     * Matrix addition of two 3x3 matrices.
+     * This function pointer can point to any one of ne10_addmat_3x3f_c, ne10_addmat_3x3f_neon or ne10_addmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    /**
+     * Matrix addition of two 2x2 matrices.
+     * This function pointer can point to any one of ne10_addmat_2x2f_c, ne10_addmat_2x2f_neon or ne10_addmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_addmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_addmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Add group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SUB_VEC Vector Sub
+     *
+     * \par
+     * These functions implement the vector sub operation for float data type.
+     */
+
+    /**
+     * @addtogroup SUB_VEC
+     * @{
+     */
+
+    /**
+     * Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_subc_float_c, ne10_subc_float_neon or ne10_subc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_subc_vec2f_c, ne10_subc_vec2f_neon or ne10_subc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_subc_vec3f_c, ne10_subc_vec3f_neon or ne10_subc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    /**
+     * Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
+     * This function pointer can point to any one of ne10_subc_vec4f_c, ne10_subc_vec4f_neon or ne10_subc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector subtracted from the input values
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_subc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_subc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+    extern ne10_result_t ne10_subc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+
+    /**
+     * Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
+     * This function pointer can point to any one of ne10_sub_float_c, ne10_sub_float_neon or ne10_sub_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_sub_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 2D vectors.
+     * This function pointer can point to any one of ne10_sub_vec2f_c, ne10_sub_vec2f_neon or ne10_sub_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 3D vectors.
+     * This function pointer can point to any one of ne10_sub_vec3f_c, ne10_sub_vec3f_neon or ne10_sub_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Vector subtraction of two 4D vectors.
+     * This function pointer can point to any one of ne10_sub_vec4f_c, ne10_sub_vec4f_neon or ne10_sub_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_sub_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_sub_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Sub group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup RSBC Vector Rsbc
+     *
+     * \par
+     * These functions implement the vector rsbc (reverse subtract: constant minus element) operation for the float data type.
+     */
+
+    /**
+     * @addtogroup RSBC
+     * @{
+     */
+    /**
+     * Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
+     * This function pointer can point to any one of ne10_rsbc_float_c, ne10_rsbc_float_neon or ne10_rsbc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_float) (ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_float_asm (ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
+     * This function pointer can point to any one of ne10_rsbc_vec2f_c, ne10_rsbc_vec2f_neon or ne10_rsbc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t *src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
+     * This function pointer can point to any one of ne10_rsbc_vec3f_c, ne10_rsbc_vec3f_neon or ne10_rsbc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t *src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /**
+     * Subtracts the vectors in an input array from a constant 4D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_rsbc_vec4f_c, ne10_rsbc_vec4f_neon and ne10_rsbc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to subtract the input values from
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_rsbc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_rsbc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    extern ne10_result_t ne10_rsbc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t *src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+    /** @} */ //end of Vector RSBC group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SUB_MAT Matrix Sub
+     *
+     * \par
+     * These functions implement the matrix subtraction operation for float data type.
+     */
+
+    /**
+     * @addtogroup SUB_MAT
+     * @{
+     */
+    /**
+     * Matrix subtraction of two 4x4 matrices.
+     * This function pointer can point to one of ne10_submat_4x4f_c, ne10_submat_4x4f_neon and ne10_submat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix subtraction of two 3x3 matrices.
+     * This function pointer can point to one of ne10_submat_3x3f_c, ne10_submat_3x3f_neon and ne10_submat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix subtraction of two 2x2 matrices.
+     * This function pointer can point to one of ne10_submat_2x2f_c, ne10_submat_2x2f_neon and ne10_submat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_submat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_submat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Sub group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_VEC Vector Multiply
+     *
+     * \par
+     * These functions implement the vector multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_VEC
+     * @{
+     */
+
+    /**
+     * Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_float_c, ne10_mulc_float_neon and ne10_mulc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec2f_c, ne10_mulc_vec2f_neon and ne10_mulc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec3f_c, ne10_mulc_vec3f_neon and ne10_mulc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_mulc_vec4f_c, ne10_mulc_vec4f_neon and ne10_mulc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to multiply the input values with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mulc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+    /**
+     * Multiplies the elements of src1 by the elements of src2 and stores the results in the dst.
+     * This function pointer can point to one of ne10_mul_float_c, ne10_mul_float_neon and ne10_mul_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_mul_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mul_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mul_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mul_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 2D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec2f_c, ne10_vmul_vec2f_neon and ne10_vmul_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 3D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec3f_c, ne10_vmul_vec3f_neon and ne10_vmul_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Multiplies the components of a 4D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmul_vec4f_c, ne10_vmul_vec4f_neon and ne10_vmul_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmul_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmul_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MLA_VEC Vector Multiply-Accumulator
+     *
+     * \par
+     * These functions implement the vector multiply-accumulate operation for float data type.
+     */
+
+    /**
+     * @addtogroup MLA_VEC
+     * @{
+     */
+
+    /**
+     * Multiplies each entry in the source array (src) by cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_float_c, ne10_mlac_float_neon and ne10_mlac_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to multiply the input elements with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_c (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_neon (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_float_asm (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec2f_c, ne10_mlac_vec2f_neon and ne10_mlac_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec3f_c, ne10_mlac_vec3f_neon and ne10_mlac_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
+     * the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+     * This function pointer can point to one of ne10_mlac_vec4f_c, ne10_mlac_vec4f_neon and ne10_mlac_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to multiply the input vectors with
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_mlac_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_mlac_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+    /**
+     * Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
+     * This function pointer can point to one of ne10_mla_float_c, ne10_mla_float_neon and ne10_mla_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc   Pointer to the accumulation array; these elements are added to the result of the multiplication operation
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_mla_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mla_float_c (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mla_float_neon (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mla_float_asm (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec2f_c, ne10_vmla_vec2f_neon and ne10_vmla_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc    Pointer to the accumulation array; its elements are added to the result of the multiplication
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec3f_c, ne10_vmla_vec3f_neon and ne10_vmla_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc    Pointer to the accumulation array; its elements are added to the result of the multiplication
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another.
+     * This function pointer can point to one of ne10_vmla_vec4f_c, ne10_vmla_vec4f_neon and ne10_vmla_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  acc    Pointer to the accumulation array; its elements are added to the result of the multiplication
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vmla_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vmla_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Multiply-Accumulator group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_MAT Matrix Multiply
+     *
+     * \par
+     * These functions implement the matrix multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_MAT
+     * @{
+     */
+
+    /**
+     * Matrix multiplication of two 4x4 matrices.
+     * This function pointer can point to one of ne10_mulmat_4x4f_c, ne10_mulmat_4x4f_neon and ne10_mulmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix multiplication of two 3x3 matrices.
+     * This function pointer can point to one of ne10_mulmat_3x3f_c, ne10_mulmat_3x3f_neon and ne10_mulmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+
+    /**
+     * Matrix multiplication of two 2x2 matrices.
+     * This function pointer can point to one of ne10_mulmat_2x2f_c, ne10_mulmat_2x2f_neon and ne10_mulmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup MUL_MAT_VEC Matrix Vector Multiply
+     *
+     * \par
+     * These functions implement the matrix vector multiply operation for float data type.
+     */
+
+    /**
+     * @addtogroup MUL_MAT_VEC
+     * @{
+     */
+    /**
+     * Matrix multiplication of 4x4 matrix and 4D vector.
+     * This function pointer can point to one of ne10_mulcmatvec_cm4x4f_v4f_c, ne10_mulcmatvec_cm4x4f_v4f_neon and ne10_mulcmatvec_cm4x4f_v4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm4x4f_v4f) (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_c (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_neon (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm4x4f_v4f_asm (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /**
+     * Matrix multiplication of 3x3 matrix and 3D vector.
+     * This function pointer can point to one of ne10_mulcmatvec_cm3x3f_v3f_c, ne10_mulcmatvec_cm3x3f_v3f_neon and ne10_mulcmatvec_cm3x3f_v3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm3x3f_v3f) (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_c (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_neon (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm3x3f_v3f_asm (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Matrix multiplication of 2x2 matrix and 2D vector.
+     * This function pointer can point to one of ne10_mulcmatvec_cm2x2f_v2f_c, ne10_mulcmatvec_cm2x2f_v2f_neon and ne10_mulcmatvec_cm2x2f_v2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the matrix to multiply the input values with
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_mulcmatvec_cm2x2f_v2f) (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_c (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_neon (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_mulcmatvec_cm2x2f_v2f_asm (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+    /** @} */ //end of Matrix Vector Multiply group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DIV_VEC Vector Div
+     *
+     * \par
+     * These functions implement the vector division operation for float data type.
+     */
+
+    /**
+     * @addtogroup DIV_VEC
+     * @{
+     */
+
+    /**
+     * Divides the elements of an input array by a constant scalar and stores the results in an output array.
+     * This function pointer can point to one of ne10_divc_float_c, ne10_divc_float_neon and ne10_divc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   The constant scalar to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_float_asm (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_divc_vec2f_c, ne10_divc_vec2f_neon or ne10_divc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 2D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_divc_vec3f_c, ne10_divc_vec3f_neon or ne10_divc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 3D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+     * This function pointer can point to one of ne10_divc_vec4f_c, ne10_divc_vec4f_neon or ne10_divc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  cst   Pointer to the 4D vector to divide the input values by
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_divc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_divc_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    /**
+     * Divides the elements of src1 by the elements of src2 and stores the results in dst.
+     * This function pointer can point to one of ne10_div_float_c, ne10_div_float_neon or ne10_div_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  The first array to use as the input array
+     * @param[in]  src2  The second array to use as the input array
+     * @param[in]  count The number of items in the two input arrays
+     */
+    extern ne10_result_t (*ne10_div_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_div_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_div_float_neon (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_div_float_asm (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 2D vector by the corresponding components of another.
+     * This function pointer can point to one of ne10_vdiv_vec2f_c, ne10_vdiv_vec2f_neon or ne10_vdiv_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 3D vector by the corresponding components of another.
+     * This function pointer can point to one of ne10_vdiv_vec3f_c, ne10_vdiv_vec3f_neon or ne10_vdiv_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 4D vector by the corresponding components of another.
+     * This function pointer can point to one of ne10_vdiv_vec4f_c, ne10_vdiv_vec4f_neon or ne10_vdiv_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_vdiv_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_vdiv_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Div group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DIV_MAT Matrix Div
+     *
+     * \par
+     * These functions implement the matrix division operation for float data type.
+     */
+
+    /**
+     * @addtogroup DIV_MAT
+     * @{
+     */
+
+    /**
+     * Divides the components of a 4x4 matrix by the corresponding components of another.
+     * This function pointer can point to one of ne10_divmat_4x4f_c, ne10_divmat_4x4f_neon or ne10_divmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 3x3 matrix by the corresponding components of another.
+     * This function pointer can point to one of ne10_divmat_3x3f_c, ne10_divmat_3x3f_neon or ne10_divmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+    /**
+     * Divides the components of a 2x2 matrix by the corresponding components of another.
+     * This function pointer can point to one of ne10_divmat_2x2f_c, ne10_divmat_2x2f_neon or ne10_divmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1  Pointer to the numerators' source array
+     * @param[in]  src2  Pointer to the denominators' source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_divmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_divmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Matrix Div group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup SETC_VEC Vector Setc
+     *
+     * \par
+     * These functions implement the vector set-to-constant (setc) operation for float data type.
+     */
+
+    /**
+     * @addtogroup SETC_VEC
+     * @{
+     */
+
+    /**
+     * Sets every element of the destination array to a constant scalar.
+     * This function pointer can point to one of ne10_setc_float_c, ne10_setc_float_neon or ne10_setc_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   The constant scalar to store in each element
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_float) (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_c (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_neon (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_float_asm (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+    /**
+     * Sets every 2D vector in the destination array to the components of a constant 2D vector.
+     * This function pointer can point to one of ne10_setc_vec2f_c, ne10_setc_vec2f_neon or ne10_setc_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 2D vector to store in each element
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec2f) (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_c (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_neon (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec2f_asm (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+    /**
+     * Sets every 3D vector in the destination array to the components of a constant 3D vector.
+     * This function pointer can point to one of ne10_setc_vec3f_c, ne10_setc_vec3f_neon or ne10_setc_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 3D vector to store in each element
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec3f) (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_c (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_neon (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec3f_asm (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+    /**
+     * Sets every 4D vector in the destination array to the components of a constant 4D vector.
+     * This function pointer can point to one of ne10_setc_vec4f_c, ne10_setc_vec4f_neon or ne10_setc_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  cst   Pointer to the 4D vector to store in each element
+     * @param[in]  count The number of items in the destination array
+     */
+    extern ne10_result_t (*ne10_setc_vec4f) (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_c (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_neon (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    extern ne10_result_t ne10_setc_vec4f_asm (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+    /** @} */ //end of Vector Setc group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup LEN_VEC Vector Len
+     *
+     * \par
+     * These functions implement the vector length operation for float data type.
+     */
+
+    /**
+     * @addtogroup LEN_VEC
+     * @{
+     */
+    /**
+     * Calculates the length of each 2D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to one of ne10_len_vec2f_c, ne10_len_vec2f_neon or ne10_len_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec2f_neon (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec2f_asm (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Calculates the length of each 3D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to one of ne10_len_vec3f_c, ne10_len_vec3f_neon or ne10_len_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec3f_neon (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec3f_asm (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculates the length of each 4D vector in the input array and stores it in the corresponding element of the output array.
+     * This function pointer can point to one of ne10_len_vec4f_c, ne10_len_vec4f_neon or ne10_len_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_len_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec4f_neon (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_len_vec4f_asm (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Len group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup NORM_VEC Vector Normalize
+     *
+     * \par
+     * These functions implement vector normalize operation for float data type.
+     */
+
+    /**
+     * @addtogroup NORM_VEC
+     * @{
+     */
+    /**
+     * Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to one of ne10_normalize_vec2f_c, ne10_normalize_vec2f_neon or ne10_normalize_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to one of ne10_normalize_vec3f_c, ne10_normalize_vec3f_neon or ne10_normalize_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
+     * This function pointer can point to one of ne10_normalize_vec4f_c, ne10_normalize_vec4f_neon or ne10_normalize_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_normalize_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_normalize_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Normalize group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup ABS_VEC Vector Abs
+     *
+     * \par
+     * These functions implement the vector absolute-value operation for float data type.
+     */
+
+    /**
+     * @addtogroup ABS_VEC
+     * @{
+     */
+
+    /**
+     * Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
+     * This function pointer can point to one of ne10_abs_float_c, ne10_abs_float_neon or ne10_abs_float_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_float) (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_float_c (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_float_neon (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_float_asm (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 2D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to one of ne10_abs_vec2f_c, ne10_abs_vec2f_neon or ne10_abs_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec2f_asm (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 3D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to one of ne10_abs_vec3f_c, ne10_abs_vec3f_neon or ne10_abs_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+    /**
+     * Generates a 4D vector from the absolute values of each of the components of an input vector.
+     * This function pointer can point to one of ne10_abs_vec4f_c, ne10_abs_vec4f_neon or ne10_abs_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_abs_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_abs_vec4f_asm (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Vector Abs group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DOT_VEC Vector Dot
+     *
+     * \par
+     * These functions implement the vector dot product operation for float data type.
+     */
+
+    /**
+     * @addtogroup DOT_VEC
+     * @{
+     */
+    /**
+     * Calculates the dot product of two 2D vectors.
+     * This function pointer can point to one of ne10_dot_vec2f_c, ne10_dot_vec2f_neon or ne10_dot_vec2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec2f_neon (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec2f_asm (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+    /**
+     * Calculates the dot product of two 3D vectors.
+     * This function pointer can point to one of ne10_dot_vec3f_c, ne10_dot_vec3f_neon or ne10_dot_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec3f_neon (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec3f_asm (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /**
+     * Calculates the dot product of two 4D vectors.
+     * This function pointer can point to one of ne10_dot_vec4f_c, ne10_dot_vec4f_neon or ne10_dot_vec4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_dot_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec4f_neon (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_dot_vec4f_asm (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Dot group
+
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup CROSS_VEC Vector Cross
+     *
+     * \par
+     * These functions implement the vector cross product operation for float data type.
+     */
+
+    /**
+     * @addtogroup CROSS_VEC
+     * @{
+     */
+
+    /**
+     * Performs a cross product operation on the two input vectors.
+     * This function pointer can point to one of ne10_cross_vec3f_c, ne10_cross_vec3f_neon or ne10_cross_vec3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src1   Pointer to the first source array
+     * @param[in]  src2   Pointer to the second source array
+     * @param[in]  count The number of items in the input arrays
+     */
+    extern ne10_result_t (*ne10_cross_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_cross_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_cross_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    extern ne10_result_t ne10_cross_vec3f_asm (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+    /** @} */ //end of Vector Cross group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup DET_MAT Matrix Determinant
+     *
+     * \par
+     * These functions implement matrix determinant operation for float data type.
+     */
+
+    /**
+     * @addtogroup DET_MAT
+     * @{
+     */
+
+    /**
+     * Calculates the determinant of a 4x4 matrix.
+     * This function pointer can point to one of ne10_detmat_4x4f_c, ne10_detmat_4x4f_neon or ne10_detmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_4x4f) (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_4x4f_c (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_4x4f_neon (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_4x4f_asm (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculates the determinant of a 3x3 matrix.
+     * This function pointer can point to one of ne10_detmat_3x3f_c, ne10_detmat_3x3f_neon or ne10_detmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_3x3f) (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_3x3f_c (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_3x3f_neon (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_3x3f_asm (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculates the determinant of a 2x2 matrix.
+     * This function pointer can point to one of ne10_detmat_2x2f_c, ne10_detmat_2x2f_neon or ne10_detmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_detmat_2x2f) (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_2x2f_c (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_2x2f_neon (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_detmat_2x2f_asm (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Determinant group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup INV_MAT Matrix Inversion
+     *
+     * \par
+     * These functions implement the matrix inversion operation for the float data type.
+     */
+
+    /**
+     * @addtogroup INV_MAT
+     * @{
+     */
+    /**
+     * Calculate the inverse of a 4x4 matrix.
+     * This function pointer can point to one of ne10_invmat_4x4f_c, ne10_invmat_4x4f_neon and ne10_invmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the inverse of a 3x3 matrix.
+     * This function pointer can point to one of ne10_invmat_3x3f_c, ne10_invmat_3x3f_neon and ne10_invmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the inverse of a 2x2 matrix.
+     * This function pointer can point to one of ne10_invmat_2x2f_c, ne10_invmat_2x2f_neon and ne10_invmat_2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_invmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_invmat_2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Invertible group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup TRANS_MAT Matrix Transpose
+     *
+     * \par
+     * These functions implement matrix transpose operation for float data type.
+     */
+
+    /**
+     * @addtogroup TRANS_MAT
+     * @{
+     */
+    /**
+     * Calculate the transpose matrix of a 4x4 matrix.
+     * This function pointer can point to one of ne10_transmat_4x4f_c, ne10_transmat_4x4f_neon and ne10_transmat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the transpose matrix of a 3x3 matrix.
+     * This function pointer can point to one of ne10_transmat_3x3f_c, ne10_transmat_3x3f_neon and ne10_transmat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+    /**
+     * Calculate the transpose matrix of a 2x2 matrix.
+     * This function pointer can point to one of ne10_transmat_2x2f_c, ne10_transmat_2x2f_neon and ne10_trans_mat2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  src   Pointer to the source array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_transmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_transmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    extern ne10_result_t ne10_trans_mat2x2f_asm (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+    /** @} */ //end of Matrix Transpose group
+
+    /**
+     * @ingroup groupMaths
+     */
+
+    /**
+     * @defgroup IDENTITY_MAT Matrix Identity
+     *
+     * \par
+     * These functions implement matrix identity operation for float data type.
+     */
+
+    /**
+     * @addtogroup IDENTITY_MAT
+     * @{
+     */
+    /**
+     * Set the identity matrix of a 4x4 matrix.
+     * This function pointer can point to one of ne10_identitymat_4x4f_c, ne10_identitymat_4x4f_neon and ne10_identitymat_4x4f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_identitymat_4x4f) (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_4x4f_c (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_4x4f_asm (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+    /**
+     * Set the identity matrix of a 3x3 matrix.
+     * This function pointer can point to one of ne10_identitymat_3x3f_c, ne10_identitymat_3x3f_neon and ne10_identitymat_3x3f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_identitymat_3x3f) (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_3x3f_c (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_3x3f_asm (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+    /**
+     * Set the identity matrix of a 2x2 matrix.
+     * This function pointer can point to one of ne10_identitymat_2x2f_c, ne10_identitymat_2x2f_neon and ne10_identity_mat2x2f_asm.
+     * @param[out] dst   Pointer to the destination array
+     * @param[in]  count The number of items in the input array
+     */
+    extern ne10_result_t (*ne10_identitymat_2x2f) (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_2x2f_c (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identitymat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    extern ne10_result_t ne10_identity_mat2x2f_asm (ne10_mat2x2f_t * dst, ne10_uint32_t count);
+    /** @} */ //end of Matrix Identity group
  
  #ifdef __cplusplus
  }
diff --git a/inc/NE10_types.h b/inc/NE10_types.h

index 4416631..ce49005 100644 (file)
--- a/inc/NE10_types.h
+++ b/inc/NE10_types.h
@@ -62,43 +62,56 @@ typedef float                   ne10_float32_t;
  typedef double                  ne10_float64_t;
  typedef int                     ne10_result_t;     // resulting [error-]code
  
+/**
+ * @brief a 2-tuple of ne10_float32_t values.
+ */
  typedef struct
  {
-        ne10_float32_t x;
-        ne10_float32_t y;
-} ne10_vec2f_t; // a 2-tuple of ne10_float32_t values
+    ne10_float32_t x;
+    ne10_float32_t y;
+} ne10_vec2f_t;
  
+/**
+ * @brief a 3-tuple of ne10_float32_t values.
+ */
  typedef struct
  {
-        ne10_float32_t x;
-        ne10_float32_t y;
-        ne10_float32_t z;
-} ne10_vec3f_t; // a 3-tuple of ne10_float32_t values
+    ne10_float32_t x;
+    ne10_float32_t y;
+    ne10_float32_t z;
+} ne10_vec3f_t;
  
+/**
+ * @brief a 4-tuple of ne10_float32_t values.
+ */
  typedef struct
  {
-        ne10_float32_t x;
-        ne10_float32_t y;
-        ne10_float32_t z;
-        ne10_float32_t w;
-} ne10_vec4f_t; // a 4-tuple of ne10_float32_t values
+    ne10_float32_t x;
+    ne10_float32_t y;
+    ne10_float32_t z;
+    ne10_float32_t w;
+} ne10_vec4f_t;
  
  /////////////////////////////////////////////////////////
  // definitions for matrix
  /////////////////////////////////////////////////////////
  
-typedef struct { ne10_float32_t r1; ne10_float32_t r2; } __attribute__((packed)) ne10_mat_row2f;
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+} __attribute__ ( (packed)) ne10_mat_row2f;
  
  typedef struct
  {
-        ne10_mat_row2f c1;
-        ne10_mat_row2f c2;
+    ne10_mat_row2f c1;
+    ne10_mat_row2f c2;
  
-} __attribute__((packed)) ne10_mat2x2f_t;     // a 2x2 matrix
+} __attribute__ ( (packed)) ne10_mat2x2f_t;   // a 2x2 matrix
  
-static inline void createColumnMajorMatrix2x2( ne10_mat2x2f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m12, ne10_float32_t m22)
+static inline void createColumnMajorMatrix2x2 (ne10_mat2x2f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m12, ne10_float32_t m22)
  {
-   assert( NULL != outMat );
+    assert (NULL != outMat);
  
      outMat->c1.r1 = m11;
      outMat->c1.r2 = m21;
@@ -107,21 +120,26 @@ static inline void createColumnMajorMatrix2x2( ne10_mat2x2f_t * outMat, ne10_flo
  }
  
  
-typedef struct { ne10_float32_t r1; ne10_float32_t r2; ne10_float32_t r3; } __attribute__((packed)) ne10_mat_row3f;
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+    ne10_float32_t r3;
+} __attribute__ ( (packed)) ne10_mat_row3f;
  
  typedef struct
  {
-        ne10_mat_row3f c1;
-        ne10_mat_row3f c2;
-        ne10_mat_row3f c3;
+    ne10_mat_row3f c1;
+    ne10_mat_row3f c2;
+    ne10_mat_row3f c3;
  
-} __attribute__((packed)) ne10_mat3x3f_t;     // a 3x3 matrix
+} __attribute__ ( (packed)) ne10_mat3x3f_t;   // a 3x3 matrix
  
-static inline void createColumnMajorMatrix3x3( ne10_mat3x3f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31,
-                                                                       ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32,
-                                                                       ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33)
+static inline void createColumnMajorMatrix3x3 (ne10_mat3x3f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31,
+        ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32,
+        ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33)
  {
-    assert( NULL != outMat );
+    assert (NULL != outMat);
  
      outMat->c1.r1 = m11;
      outMat->c1.r2 = m21;
@@ -137,23 +155,29 @@ static inline void createColumnMajorMatrix3x3( ne10_mat3x3f_t * outMat, ne10_flo
  }
  
  
-typedef struct { ne10_float32_t r1; ne10_float32_t r2; ne10_float32_t r3; ne10_float32_t r4; } __attribute__((packed)) ne10_mat_row4f;
+typedef struct
+{
+    ne10_float32_t r1;
+    ne10_float32_t r2;
+    ne10_float32_t r3;
+    ne10_float32_t r4;
+} __attribute__ ( (packed)) ne10_mat_row4f;
  
  typedef struct
  {
-        ne10_mat_row4f c1;
-        ne10_mat_row4f c2;
-        ne10_mat_row4f c3;
-        ne10_mat_row4f c4;
+    ne10_mat_row4f c1;
+    ne10_mat_row4f c2;
+    ne10_mat_row4f c3;
+    ne10_mat_row4f c4;
  
-} __attribute__((packed)) ne10_mat4x4f_t;     // a 4x4 matrix
+} __attribute__ ( (packed)) ne10_mat4x4f_t;   // a 4x4 matrix
  
-static inline void createColumnMajorMatrix4x4( ne10_mat4x4f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31, ne10_float32_t m41,
-                                                                       ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32, ne10_float32_t m42,
-                                                                       ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33, ne10_float32_t m43,
-                                                                       ne10_float32_t m14, ne10_float32_t m24, ne10_float32_t m34, ne10_float32_t m44)
+static inline void createColumnMajorMatrix4x4 (ne10_mat4x4f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31, ne10_float32_t m41,
+        ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32, ne10_float32_t m42,
+        ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33, ne10_float32_t m43,
+        ne10_float32_t m14, ne10_float32_t m24, ne10_float32_t m34, ne10_float32_t m44)
  {
-    assert( NULL != outMat );
+    assert (NULL != outMat);
  
      outMat->c1.r1 = m11;
      outMat->c1.r2 = m21;
@@ -189,7 +213,7 @@ typedef struct
      ne10_uint8_t ifft_flag;                          /**< Flag for selection of CFFT/ICFFT */
      ne10_uint8_t bit_reverse_flag;                   /**< Flag for selection of bitreversal or not */
      ne10_float32_t *p_twiddle;                       /**< Points to the twiddle factors array. The array is of length 2 * MaxFFTSize. */
-    ne10_uint16_t *p_bit_rev_table;                  /**< Points to the bit reversal array. The array is of size       MaxFFTSize/4 */
+    ne10_uint16_t *p_bit_rev_table;                  /**< Points to the bit reversal array. The array is of size    MaxFFTSize/4 */
      ne10_uint16_t twid_coef_modifier;                /**< Modifier to support different FFT sizes with same twiddle table */
      ne10_uint16_t bit_rev_factor;                    /**< Modifier to support different FFT sizes with same bit reversal table */
      ne10_float32_t one_by_fft_len;                   /**< 1/(Length of the FFT). */
@@ -214,7 +238,7 @@ typedef struct
  // definitions for fir
  /////////////////////////////////////////////////////////
  
-/*
+/**
   * @brief Instance structure for the floating-point FIR filter.
   */
  typedef struct
@@ -224,7 +248,7 @@ typedef struct
      ne10_float32_t *pCoeffs;   /**< Points to the coefficient array. The array is of length numTaps. */
  } ne10_fir_instance_f32_t;
  
-/*
+/**
   * @brief Instance structure for the floating point FIR Lattice filter.
   */
  typedef struct
@@ -234,7 +258,7 @@ typedef struct
      ne10_float32_t *pCoeffs;     /**< Points to the coefficient array. The array is of length numStages. */
  } ne10_fir_lattice_instance_f32_t;
  
-/*
+/**
   * @brief Instance structure for the floating-point FIR Decimation.
   */
  typedef struct
@@ -245,7 +269,7 @@ typedef struct
      ne10_float32_t    *pState;       /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
  } ne10_fir_decimate_instance_f32_t;
  
-/*
+/**
   * @brief Instance structure for the floating-point FIR Interpolation.
   */
  typedef struct
@@ -256,7 +280,7 @@ typedef struct
      ne10_float32_t *pState;          /**< Points to the state variable array. The array is of length numTaps+maxBlockSize-1. */
  } ne10_fir_interpolate_instance_f32_t;
  
-/*
+/**
   * @brief Instance structure for the floating-point FIR Sparse filter.
   */
  typedef struct
diff --git a/modules/dsp/NE10_cfft.c b/modules/dsp/NE10_cfft.c

index c8ad334..6063894 100644 (file)
--- a/modules/dsp/NE10_cfft.c
+++ b/modules/dsp/NE10_cfft.c
@@ -30,17 +30,141 @@
   */
  
  #include "NE10_types.h"
+/**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup CFFT_CIFFT Complex FFT
+ *
+ * \par
+ * Complex Fast Fourier Transform (CFFT) and Complex Inverse Fast Fourier Transform (CIFFT) are efficient algorithms to compute the Discrete Fourier Transform (DFT) and Inverse Discrete Fourier Transform (IDFT).
+ * Computational complexity of CFFT reduces drastically when compared to DFT.
+ * \par
+ * This set of functions implements CFFT/CIFFT
+ * for floating-point data types.  The functions operate on out-of-place buffer which use different buffer for input and output.
+ * Complex input is stored in input buffer in an interleaved fashion.
+ *
+ * \par
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * <code>2*fftLen</code> samples through the transform.  <code>pSrc</code>  points to input arrays containing <code>2*fftLen</code> values.
+ * \par
+ * The <code>pDst</code> points to the array of output buffer of size <code>2*fftLen</code> and inputs and outputs are stored in an interleaved fashion as shown below.
+ * <pre> {real[0], imag[0], real[1], imag[1],..} </pre>
+ *
+ * \par Lengths supported by the transform:
+ * \par
+ * Internally, the functions utilize a radix-4 decimation in frequency(DIF) algorithm
+ * and the size of the FFT supported are of the lengths [16, 64, 256, 1024].
+ *
+ *
+ * \par Algorithm:
+ *
+ * <b>Complex Fast Fourier Transform:</b>
+ * \par
+ * Input real and imaginary data:
+ * <pre>
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ * </pre>
+ * where N is length of FFT
+ * \par
+ * Output real and imaginary data:
+ * <pre>
+ * X(4r) = xa'+ j * ya'
+ * X(4r+1) = xb'+ j * yb'
+ * X(4r+2) = xc'+ j * yc'
+ * X(4r+3) = xd'+ j * yd'
+ * </pre>
+ * \par
+ * Twiddle factors for radix-4 FFT:
+ * <pre>
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+ * </pre>
+ *
+ * \par
+ * \image html CFFT.gif "Radix-4 Decimation-in Frequency Complex Fast Fourier Transform"
+ *
+ * \par
+ * The output of the radix-4 CFFT is in digit-reversed order. Interchanging the middle two branches of every butterfly results in bit-reversed output.
+ * \par
+ * <b> Butterfly CFFT equations:</b>
+ * <pre>
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xc' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yc' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xb' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yb' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ * </pre>
+ *
+ *
+ * <b>Complex Inverse Fast Fourier Transform:</b>
+ * \par
+ * CIFFT uses same twiddle factor table as CFFT with modifications in the design equation as shown below.
+ *
+ * \par
+ * <b> Modified Butterfly CIFFT equations:</b>
+ * <pre>
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xc' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yc' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xb' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yb' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ * </pre>
+ *
+ * \par Instance Structure
+ * A separate instance structure must be defined for each Instance but the twiddle factors and bit reversal tables can be reused.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Initializes twiddle factor table and bit reversal table pointers
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Manually initialize the instance structure as follows:
+ * <pre>
+ *ne10_cfft_radix4_instance_f32_t S = {fft_len, ifft_flag, bit_reverse_flag, p_twiddle, p_bit_rev_table, twid_coef_modifier, bit_rev_factor, one_by_fft_len};
+ * </pre>
+ * \par
+ * where <code>fft_len</code> is the length of the CFFT/CIFFT; <code>ifft_flag</code> is the flag for selection of CFFT or CIFFT (set ifft_flag to calculate CIFFT, otherwise CFFT is calculated);
+ * <code>bit_reverse_flag</code> is the flag for selection of output order (set bit_reverse_flag to output in normal order, otherwise output is in bit-reversed order);
+ * <code>p_twiddle</code> points to the array of twiddle coefficients; <code>p_bit_rev_table</code> points to the bit reversal table;
+ * <code>bit_rev_factor</code> is the modifier for the bit reversal table, which supports all FFT lengths with the same table;
+ * <code>twid_coef_modifier</code> is the modifier for the twiddle factor table, which supports all FFT lengths with the same table;
+ * <code>one_by_fft_len</code> is the value of 1/fft_len used to calculate the CIFFT.
+ *
+ */
  
  
-/*
-; * @brief  Core radix-4 FFT of floating-point data.
-; * @param[out]  *pDst
-; * @param[in]  *pSrc             points to the In-place buffer
-; * @param[in]  N                 length of FFT
-; * @param[in]  *pCoef            points to the twiddle factors
-; * @retureq none.
-; * The function implements a Radix-4 Complex FFT
-; */
+/**
+ * @addtogroup CFFT_CIFFT
+ * @{
+ */
+
+/**
+ * @brief Core radix-4 FFT of floating-point data.
+ * @param[out]  *pDst            points to the output buffer (out-of-place)
+ * @param[in]  *pSrc             points to the input buffer (out-of-place: pSrc is used as an intermediate buffer, so the input buffer is destroyed)
+ * @param[in]  N                 length of FFT
+ * @param[in]  *pCoef            points to the twiddle factors
+ * @return none.
+ * The function implements a Radix-4 Complex FFT
+ * Can support FFT lengths of 16, 64, 256, 1024
+ */
  
  void ne10_radix4_butterfly_float_c(
                       ne10_float32_t *pDst,
@@ -256,15 +380,16 @@ void ne10_radix4_butterfly_float_c(
      }
  }
  
-/*
-; * @brief  Core radix-4 IFFT of floating-point data.
-; * @param[out]  *pDst
-; * @param[in]  *pSrc             points to the In-place buffer
-; * @param[in]  N                 length of FFT
-; * @param[in]  *pCoef            points to the twiddle factors
-; * @retureq none.
-; * The function implements a Radix-4 Complex IFFT
-; */
+
+/**
+ * @brief Core radix-4 IFFT of floating-point data.
+ * @param[out]  *pDst            points to the output buffer (out-of-place)
+ * @param[in]  *pSrc             points to the input buffer (out-of-place: pSrc is used as an intermediate buffer, so the input buffer is destroyed)
+ * @param[in]  N                 length of FFT
+ * @param[in]  *pCoef            points to the twiddle factors
+ * @return none.
+ * The function implements a Radix-4 Complex IFFT
+ */
  
  void ne10_radix4_butterfly_inverse_float_c(
                       ne10_float32_t *pDst,
@@ -587,3 +712,7 @@ void ne10_radix4_butterfly_inverse_float_c(
      }
  }
  
+
+/**
+ * @} end of CFFT_CIFFT group
+ */
diff --git a/modules/dsp/NE10_fir.c b/modules/dsp/NE10_fir.c

index 0c5cd78..07da376 100644 (file)
--- a/modules/dsp/NE10_fir.c
+++ b/modules/dsp/NE10_fir.c
@@ -38,6 +38,7 @@
  /**
   * @defgroup FIR Finite Impulse Response (FIR) Filters
   *
+ * \par
   * This set of functions implements Finite Impulse Response (FIR) filters
   * for floating-point data types.
   * The functions operate on blocks of input and output data and each call to the function processes
@@ -351,6 +352,93 @@ void ne10_fir_float_c (const ne10_fir_instance_f32_t * S,
      }
  
  }
+/** @} */ //end of FIR group
+
+/**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup FIR_Decimate Finite Impulse Response (FIR) Decimator
+ *
+ * \par
+ * These functions combine an FIR filter together with a decimator.
+ * They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
+ * Conceptually, the functions are equivalent to the block diagram below:
+ * \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
+ * When decimating by a factor of <code>M</code>, the signal should be prefiltered by a lowpass filter with a normalized
+ * cutoff frequency of <code>1/M</code> in order to prevent aliasing distortion.
+ * The user of the function is responsible for providing the filter coefficients.
+ *
+ * The FIR decimator functions provided in this library combine the FIR filter and the decimator in an efficient manner.
+ * Instead of calculating all of the FIR filter outputs and discarding <code>M-1</code> out of every <code>M</code>, only the
+ * samples output by the decimator are computed.
+ * The functions operate on blocks of input and output data.
+ * <code>pSrc</code> points to an array of <code>blockSize</code> input values and
+ * <code>pDst</code> points to an array of <code>blockSize/M</code> output values.
+ * In order to have an integer number of output samples <code>blockSize</code>
+ * must always be a multiple of the decimation factor <code>M</code>.
+ *
+ * The library provides functions for floating-point data types.
+ *
+ * \par Algorithm:
+ * The FIR portion of the algorithm uses the standard form filter:
+ * <pre>
+ *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+ * </pre>
+ * where, <code>b[n]</code> are the filter coefficients.
+ * \par
+ * The <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
+ * Coefficients are stored in time reversed order.
+ * \par
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
+ * Samples in the state buffer are stored in the order:
+ * \par
+ * <pre>
+ *    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+ * </pre>
+ * The state variables are updated after each block of data is processed, the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * - Checks to make sure that the size of the input is a multiple of the decimation factor.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ * <pre>
+ *ne10_fir_decimate_instance_f32_t S = {M, numTaps, pCoeffs, pState};
+ * </pre>
+ * where <code>M</code> is the decimation factor; <code>numTaps</code> is the number of filter coefficients in the filter;
+ * <code>pCoeffs</code> is the address of the coefficient buffer;
+ * <code>pState</code> is the address of the state buffer.
+ * Be sure to set the values in the state buffer to zeros when doing static initialization.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Decimate
+ * @{
+ */
  
  /**
     * @brief Processing function for the floating-point FIR decimator.
@@ -515,6 +603,102 @@ void ne10_fir_decimate_float_c (const ne10_fir_decimate_instance_f32_t * S,
      }
  
  }
+/** @} */ //end of FIR_Decimate group
+
+
+/**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
+ *
+ * \par
+ * These functions combine an upsampler (zero stuffer) and an FIR filter.
+ * They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
+ * Conceptually, the functions are equivalent to the block diagram below:
+ * \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
+ * After upsampling by a factor of <code>L</code>, the signal should be filtered by a lowpass filter with a normalized
+ * cutoff frequency of <code>1/L</code> in order to eliminate high frequency copies of the spectrum.
+ * The user of the function is responsible for providing the filter coefficients.
+ *
+ * The FIR interpolator functions provided in this library combine the upsampler and FIR filter in an efficient manner.
+ * The upsampler inserts <code>L-1</code> zeros between each sample.
+ * Instead of multiplying by these zero values, the FIR filter is designed to skip them.
+ * This leads to an efficient implementation without any wasted effort.
+ * The functions operate on blocks of input and output data.
+ * <code>pSrc</code> points to an array of <code>blockSize</code> input values and
+ * <code>pDst</code> points to an array of <code>blockSize*L</code> output values.
+ *
+ * The library provides functions for floating-point data types.
+ *
+ * \par Algorithm:
+ * The functions use a polyphase filter structure:
+ * <pre>
+ *    y[n] = b[0] * x[n] + b[L]   * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]
+ *    y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]
+ *    ...
+ *    y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]
+ * </pre>
+ * This approach is more efficient than straightforward upsample-then-filter algorithms.
+ * With this method the computation is reduced by a factor of <code>1/L</code> when compared to using a standard FIR filter.
+ * \par
+ * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
+ * <code>numTaps</code> must be a multiple of the interpolation factor <code>L</code> and this is checked by the
+ * initialization functions.
+ * Internally, the function divides the FIR filter's impulse response into shorter filters of length
+ * <code>phaseLength=numTaps/L</code>.
+ * Coefficients are stored in time reversed order.
+ * \par
+ * <pre>
+ *    {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+ * </pre>
+ * \par
+ * <code>pState</code> points to a state array of size <code>blockSize + phaseLength - 1</code>.
+ * Samples in the state buffer are stored in the order:
+ * \par
+ * <pre>
+ *    {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}
+ * </pre>
+ * The state variables are updated after each block of data is processed, the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ * <pre>
+ * ne10_fir_interpolate_instance_f32_t S = {L, phaseLength, pCoeffs, pState};
+ * </pre>
+ * where <code>L</code> is the interpolation factor; <code>phaseLength=numTaps/L</code> is the
+ * length of each of the shorter FIR filters used internally,
+ * <code>pCoeffs</code> is the address of the coefficient buffer;
+ * <code>pState</code> is the address of the state buffer.
+ * Be sure to set the values in the state buffer to zeros when doing static initialization.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
  
  /**
   * @brief Processing function for the floating-point FIR interpolator.
@@ -698,6 +882,83 @@ void ne10_fir_interpolate_float_c (const ne10_fir_interpolate_instance_f32_t * S
      }
  
  }
+/** @} */ //end of FIR_Interpolate group
+
+
+/**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters
+ *
+ * \par
+ * This set of functions implements Finite Impulse Response (FIR) lattice filters
+ * for floating-point data types.  Lattice filters are used in a
+ * variety of adaptive filter applications.  The filter structure is feedforward and
+ * the net impulse response is finite length.
+ * The functions operate on blocks
+ * of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
+ * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
+ *
+ * \par Algorithm:
+ * \image html FIRLattice.gif "Finite Impulse Response Lattice filter"
+ * The following difference equation is implemented:
+ * <pre>
+ *    f0[n] = g0[n] = x[n]
+ *    fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
+ *    gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
+ *    y[n] = fM[n]
+ * </pre>
+ * \par
+ * <code>pCoeffs</code> points to the array of reflection coefficients of size <code>numStages</code>.
+ * Reflection Coefficients are stored in the following order.
+ * \par
+ * <pre>
+ *    {k1, k2, ..., kM}
+ * </pre>
+ * where M is number of stages
+ * \par
+ * <code>pState</code> points to a state array of size <code>numStages</code>.
+ * The state variables (g values) hold previous inputs and are stored in the following order.
+ * <pre>
+ *    {g0[n], g1[n], g2[n] ...gM-1[n]}
+ * </pre>
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+ * <pre>
+ * ne10_fir_lattice_instance_f32_t S = {numStages, pState, pCoeffs};
+ * </pre>
+ * \par
+ * where <code>numStages</code> is the number of stages in the filter; <code>pState</code> is the address of the state buffer;
+ * <code>pCoeffs</code> is the address of the coefficient buffer.
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR Lattice filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
  
  /**
     * @brief Processing function for the floating-point FIR lattice filter.
@@ -1004,10 +1265,11 @@ void ne10_fir_lattice_float_c (const ne10_fir_lattice_instance_f32_t * S,
      }
  
  }
-/**
-   * @brief floating-point Circular write function.
-   */
+/** @} */ //end of FIR_Lattice group
  
+/**
+ * @brief floating-point Circular write function.
+ */
  static void ne10_circular_write_float (ne10_int32_t * circBuffer,
      ne10_int32_t L,
      ne10_uint16_t * writeOffset,
@@ -1102,6 +1364,67 @@ static void ne10_circular_read_float (ne10_int32_t * circBuffer,
      *readOffset = rOffset;
  }
  
+/**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters
+ *
+ * \par
+ * This group of functions implements sparse FIR filters.
+ * Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero.
+ * Sparse filters are used for simulating reflections in communications and audio applications.
+ *
+ * There are separate functions for floating-point data types.
+ * The functions operate on blocks  of input and output data and each call to the function processes
+ * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
+ * <code>pDst</code> point to input and output arrays respectively containing <code>blockSize</code> values.
+ *
+ * \par Algorithm:
+ * The sparse filter instance structure contains an array of tap indices <code>pTapDelay</code> which specifies the locations of the non-zero coefficients.
+ * This is in addition to the coefficient array <code>b</code>.
+ * The implementation essentially skips the multiplications by zero and leads to an efficient realization.
+ * <pre>
+ *     y[n] = b[0] * x[n-pTapDelay[0]] + b[1] * x[n-pTapDelay[1]] + b[2] * x[n-pTapDelay[2]] + ...+ b[numTaps-1] * x[n-pTapDelay[numTaps-1]]
+ * </pre>
+ * \par
+ * \image html FIRSparse.gif "Sparse FIR filter.  b[n] represents the filter coefficients"
+ * \par
+ * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>;
+ * <code>pTapDelay</code> points to an array of nonzero indices and is also of size <code>numTaps</code>;
+ * <code>pState</code> points to a state array of size <code>maxDelay + blockSize</code>, where
+ * <code>maxDelay</code> is the largest offset value that is ever used in the <code>pTapDelay</code> array.
+ * Some of the processing functions also require temporary working buffers.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 4 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes the floating-point filter instance structure:
+ * <pre>
+ *ne10_fir_sparse_instance_f32_t S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ * </pre>
+ *
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
  
  /**
   * @brief Processing function for the floating-point sparse FIR filter.
@@ -1277,8 +1600,5 @@ void ne10_fir_sparse_float_c (ne10_fir_sparse_instance_f32_t * S,
      }
  
  }
+/** @} */ //end of FIR_Sparse group
  
-
-/**
- * @} end of FIR group
- */
diff --git a/modules/dsp/NE10_iir.c b/modules/dsp/NE10_iir.c

index b8c08a8..d886e00 100644 (file)
--- a/modules/dsp/NE10_iir.c
+++ b/modules/dsp/NE10_iir.c
@@ -38,8 +38,9 @@
  /**
   * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
   *
+ * \par
   * This set of functions implements lattice filters
- * for Q15, Q31 and floating-point data types.  Lattice filters are used in a
+ * for floating-point data types.  Lattice filters are used in a
   * variety of adaptive filter applications.  The filter structure has feedforward and
   * feedback components and the net impulse response is infinite length.
   * The functions operate on blocks
@@ -306,10 +307,4 @@ void ne10_iir_lattice_float_c (const ne10_iir_lattice_instance_f32_t * S,
      }
  
  }
-
-
-
-
-/**
- * @} end of IIR_Lattice group
- */
+/** @} */ //end of IIR_Lattice group
diff --git a/modules/dsp/NE10_rfft.c b/modules/dsp/NE10_rfft.c

index b29cad5..6ad8e20 100644 (file)
--- a/modules/dsp/NE10_rfft.c
+++ b/modules/dsp/NE10_rfft.c
@@ -32,6 +32,84 @@
  #include "NE10_types.h"
  
  /**
+ * @ingroup groupDSPs
+ */
+
+/**
+ * @defgroup RFFT_RIFFT Real FFT
+ *
+ * \par
+ * Complex FFT/IFFT typically assumes complex input and output. However many applications use real valued data in time domain.
+ * Real FFT/IFFT processes real-valued sequences efficiently, requiring less memory and less computation.
+ *
+ * \par
+ * This set of functions implements the Real Fast Fourier Transform (RFFT) and the Real Inverse Fast Fourier Transform (RIFFT)
+ * for floating-point data types.
+ *
+ *
+ * \par Algorithm:
+ *
+ * <b>Real Fast Fourier Transform:</b>
+ * \par
+ * Real FFT of N-point is calculated using CFFT of N/2-point and the Split RFFT process, as shown in the figure below.
+ * \par
+ * \image html RFFT.gif "Real Fast Fourier Transform"
+ * \par
+ * The RFFT functions operate on blocks of input and output data and each call to the function processes
+ * <code>fftLenR</code> samples through the transform.  <code>pSrc</code>  points to input array containing <code>fftLenR</code> values.
+ * <code>pDst</code>  points to output array containing <code>2*fftLenR</code> values. \n
+ * Input for real FFT is in the order of
+ * <pre>{real[0], real[1], real[2], real[3], ..}</pre>
+ * Output for real FFT is complex and is in the order of
+ * <pre>{real(0), imag(0), real(1), imag(1), ...}</pre>
+ *
+ * <b>Real Inverse Fast Fourier Transform:</b>
+ * \par
+ * Real IFFT of N-point is calculated using the Split RIFFT process and CFFT of N/2-point, as shown in the figure below.
+ * \par
+ * \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+ * \par
+ * The RIFFT functions operate on blocks of input and output data and each call to the function processes
+ * <code>2*fftLenR</code> samples through the transform.  <code>pSrc</code>  points to input array containing <code>2*fftLenR</code> values.
+ * <code>pDst</code>  points to output array containing <code>fftLenR</code> values. \n
+ * Input for real IFFT is complex and is in the order of
+ * <pre>{real(0), imag(0), real(1), imag(1), ...}</pre>
+ *  Output for real IFFT is real and in the order of
+ * <pre>{real[0], real[1], real[2], real[3], ..}</pre>
+ *
+ * \par Lengths supported by the transform:
+ * \par
+ * Real FFT/IFFT supports the lengths [128, 512, 2048], as it internally uses CFFT/CIFFT.
+ *
+ * \par Instance Structure
+ * A separate instance structure must be defined for each Instance but the twiddle factors can be reused.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Initializes twiddle factor tables.
+ * - Initializes CFFT data structure fields.
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Manually initialize the instance structure as follows:
+ * <pre>
+ *ne10_rfft_instance_f32_t S = {fft_len_real, fft_len_by2, ifft_flag_r, bit_reverse_flag_r, twid_coef_r_modifier, p_twiddle_A_real, p_twiddle_B_real, p_cfft};
+ * </pre>
+ * where <code>fft_len_real</code> is the length of the RFFT/RIFFT; <code>fft_len_by2</code> is the length of the CFFT/CIFFT;
+ * <code>ifft_flag_r</code> is the flag selecting RFFT or RIFFT (set <code>ifft_flag_r</code> to calculate the RIFFT, otherwise the RFFT is calculated);
+ * <code>bit_reverse_flag_r</code> is the flag selecting the output order (set <code>bit_reverse_flag_r</code> to output in normal order, otherwise the output is in bit-reversed order);
+ * <code>twid_coef_r_modifier</code> is the modifier for the twiddle factor table, which supports the 128, 512 and 2048 RFFT lengths with the same table;
+ * <code>p_twiddle_A_real</code> points to the A array of twiddle coefficients; <code>p_twiddle_B_real</code> points to the B array of twiddle coefficients;
+ * <code>p_cfft</code> points to the CFFT Instance structure. The CFFT structure also needs to be initialized, refer to arm_cfft_radix4_f32() for details regarding
+ * static initialization of cfft structure.
+ *
+ */
+
+/**
   * @brief  Core Real FFT process
   * @param[in]   *pSrc                points to the Input buffer
   * @param[in]   N                    length of Real FFT
@@ -164,17 +242,21 @@ static void ne10_split_rifft_float_c(
  }
  
  /**
+ * @addtogroup RFFT_RIFFT
+ * @{
+ */
+
+/**
   * @brief  Real FFT process
- * @param  *S is an instance for the structure
- * @param  *pSrc points to the input buffer
+ * @param[in]  *S is an instance for the structure
+ * @param[in]  *pSrc points to the input buffer (out-of-place: it's also a tmp buffer, so the input buffer is destroyed)
+ * @param[out]  *pDst points to the output buffer (out-of-place)
+ * @param[in]  *pTemp points to the temp buffer (used as an intermediate buffer)
   * @return none.
   * The function implements a Real FFT/ Real IFFT depending
   * on the direction flag
   * Can support FFT lengths of 128, 512, 2048
   *
- * <b>Approximate Cycle Calculation for M4: </b>
- *
- * <code>C0 + C1 * fftLen </code>
   */
  void ne10_rfft_float_c(
                       const ne10_rfft_instance_f32_t * S,
@@ -204,4 +286,6 @@ void ne10_rfft_float_c(
  
  }
  
-
+/**
+ * @} end of RFFT_RIFFT group
+ */
diff --git a/modules/dsp/NE10_rfft.neon.c b/modules/dsp/NE10_rfft.neon.c

index a914109..419a971 100644 (file)
--- a/modules/dsp/NE10_rfft.neon.c
+++ b/modules/dsp/NE10_rfft.neon.c
@@ -459,17 +459,21 @@ static void ne10_split_rifft_float_neon(
  }
  
  /**
+ * @addtogroup RFFT_RIFFT
+ * @{
+ */
+
+/**
   * @brief  Real FFT process
- * @param  *S is an instance for the structure
- * @param  *pSrc points to the input buffer
+ * @param[in]  *S is an instance for the structure
+ * @param[in]  *pSrc points to the input buffer (out-of-place: it's also a tmp buffer, so the input buffer is destroyed)
+ * @param[out]  *pDst points to the output buffer (out-of-place)
+ * @param[in]  *pTemp points to the temp buffer (used as an intermediate buffer)
   * @return none.
   * The function implements a Real FFT/ Real IFFT depending
   * on the direction flag
   * Can support FFT lengths of 128, 512, 2048
   *
- * <b>Approximate Cycle Calculation for M4: </b>
- *
- * <code>C0 + C1 * fftLen </code>
   */
  void ne10_rfft_float_neon(
                       const ne10_rfft_instance_f32_t * S,
@@ -498,5 +502,7 @@ void ne10_rfft_float_neon(
      }
  
  }
-
+/**
+ * @} end of RFFT_RIFFT group
+ */
  
diff --git a/modules/math/NE10_add.c b/modules/math/NE10_add.c

index 8a6f537..d08a247 100644 (file)
--- a/modules/math/NE10_add.c
+++ b/modules/math/NE10_add.c
@@ -34,6 +34,7 @@
  
  #include <assert.h>
  
+
  ne10_result_t ne10_add_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count)
  {
      NE10_X_OPERATION_FLOAT_C
diff --git a/samples/NE10_test.c b/samples/NE10_test.c

index 407fcc3..c4cd81f 100644 (file)
--- a/samples/NE10_test.c
+++ b/samples/NE10_test.c
@@ -24,12 +24,60 @@
   *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   */
+#include <stdio.h>
+#include <stdlib.h>
  
  #include "NE10.h"
-#include "NE10_init.h"
  
-// This test code shows you how you can statically embed NE10 in your code
+/**
+ * @ingroup groupSamples
+ */
+/**
+ * @addtogroup groupSamples
+ * @{
+ */
  
+/**
+ * @brief Sample showing how to call an Ne10 function with automatic NEON hardware detection: passes 5 random floats and a random constant to ne10_addc_float. The output is not inspected.
+ */
+void test_add1 (void)
+{
+    int i;
+    ne10_float32_t thesrc[5];   /* input vector */
+    ne10_float32_t thecst;      /* constant operand */
+    ne10_float32_t thedst[5];   /* output vector */
+
+    for (i = 0; i < 5; i++)
+    {
+        thesrc[i] = (ne10_float32_t) rand() / RAND_MAX * 5.0f; /* pseudo-random value in [0, 5] */
+    }
+    thecst = (ne10_float32_t) rand() / RAND_MAX * 5.0f; /* same [0, 5] range as the inputs */
+
+    ne10_addc_float (thedst , thesrc, thecst, 5); /* presumably thedst[i] = thesrc[i] + thecst for 5 elements -- confirm against NE10_math.h */
+}
+
+/**
+ * @brief Sample showing how to call Ne10 functions directly: passes the same 5 random floats and random constant to both ne10_addc_float_c and ne10_addc_float_neon. The outputs are not compared.
+ */
+void test_add2 (void)
+{
+    int i;
+    ne10_float32_t thesrc[5];   /* input vector shared by both calls */
+    ne10_float32_t thecst;      /* constant operand shared by both calls */
+    ne10_float32_t thedst1[5];  /* output of the _c call */
+    ne10_float32_t thedst2[5];  /* output of the _neon call */
+    for (i = 0; i < 5; i++)
+    {
+        thesrc[i] = (ne10_float32_t) rand() / RAND_MAX * 5.0f; /* pseudo-random value in [0, 5] */
+    }
+    thecst = (ne10_float32_t) rand() / RAND_MAX * 5.0f; /* same [0, 5] range as the inputs */
+
+    ne10_addc_float_c (thedst1 , thesrc, thecst, 5);    /* presumably the plain-C implementation, judging by the suffix -- confirm */
+    ne10_addc_float_neon (thedst2 , thesrc, thecst, 5); /* presumably the NEON implementation; NOTE(review): likely requires NEON-capable hardware -- confirm */
+}
+/**
+ * @} end of groupSamples
+ */
  void main()
  {
      ne10_result_t status;
@@ -40,5 +88,7 @@ void main()
          printf ("NE10 init failed.\n");
  
      printf ("NE10 has been initialized.\n");
+    test_add1();
+    test_add2();
  }
author	yang <yang.zhang@arm.com>
	Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)
committer	yang <yang.zhang@arm.com>
	Tue, 18 Dec 2012 08:33:59 +0000 (16:33 +0800)
doc/FunctionList.txt	[deleted file]	patch \| blob \| history
doc/doxygen/doxygen.cfg	[moved from tools/doxygen/doxygen.cfg with 99% similarity]	patch \| blob \| history
doc/doxygen/image/CFFT.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/FIR.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/FIRDecimator.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/FIRInterpolator.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/FIRLattice.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/FIRSparse.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/IIRLattice.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/RFFT.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/RIFFT.gif	[new file with mode: 0755]	patch \| blob
doc/doxygen/image/ne10_library.png	[new file with mode: 0644]	patch \| blob
doc/doxygen/image/ne10_logo.png	[new file with mode: 0644]	patch \| blob
inc/NE10.h		patch \| blob \| history
inc/NE10_dsp.h		patch \| blob \| history
inc/NE10_init.h		patch \| blob \| history
inc/NE10_math.h		patch \| blob \| history
inc/NE10_types.h		patch \| blob \| history
modules/dsp/NE10_cfft.c		patch \| blob \| history
modules/dsp/NE10_fir.c		patch \| blob \| history
modules/dsp/NE10_iir.c		patch \| blob \| history
modules/dsp/NE10_rfft.c		patch \| blob \| history
modules/dsp/NE10_rfft.neon.c		patch \| blob \| history
modules/math/NE10_add.c		patch \| blob \| history
samples/NE10_test.c		patch \| blob \| history